Make UpperBidiagonalization accept row-major matrices (bug #769 )

* Give temporary workspace the same storage order as original matrix * Take storage order into account when determining inner stride of rows and columns * Change one test to use a row-major matrix.
Fix bug #852 : define Traits type in general_matrix_matrix_product when EIGEN_USE_BLAS is defined
2026-04-10 11:34:33 +08:00 · 2014-09-12 14:52:35 +01:00 · 2014-08-08 04:05:28 -04:00 · 2014-09-11 10:27:46 +02:00 · 2014-09-10 23:11:58 +02:00 · 2014-09-10 23:10:01 +02:00
489 changed files with 23664 additions and 20972 deletions
--- a/.hgeol
+++ b/.hgeol
@@ -1,6 +1,9 @@
 [patterns]
+*.sh = LF
+*.MINPACK = CRLF
 scripts/*.in = LF
 debug/msvc/*.dat = CRLF
+debug/msvc/*.natvis = CRLF
 unsupported/test/mpreal/*.* = CRLF
 ** = native

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,7 +108,8 @@ endif()
 set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320")

 macro(ei_add_cxx_compiler_flag FLAG)
-  string(REGEX REPLACE "-" "" SFLAG ${FLAG})
+  string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
+  string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
  check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
  if(COMPILER_SUPPORT_${SFLAG})
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
@@ -118,7 +119,7 @@ endmacro(ei_add_cxx_compiler_flag)
 if(NOT MSVC)
  # We assume that other compilers are partly compatible with GNUCC
  
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
+#  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
  set(CMAKE_CXX_FLAGS_DEBUG "-g3")
  set(CMAKE_CXX_FLAGS_RELEASE "-g0 -O2")
  
@@ -142,6 +143,9 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wpointer-arith")
  ei_add_cxx_compiler_flag("-Wwrite-strings")
  ei_add_cxx_compiler_flag("-Wformat-security")
+  ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
+  ei_add_cxx_compiler_flag("-Wenum-conversion")
+  ei_add_cxx_compiler_flag("-Wc++11-extensions")
  
  ei_add_cxx_compiler_flag("-Wno-psabi")
  ei_add_cxx_compiler_flag("-Wno-variadic-macros")
@@ -153,6 +157,7 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
  ei_add_cxx_compiler_flag("-wd2304")                   # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
  
+  
  # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
  # Moreover we should not set both -strict-ansi and -ansi
  check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
@@ -196,6 +201,18 @@ if(NOT MSVC)
    message(STATUS "Enabling SSE4.2 in tests/examples")
  endif()

+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
+  if(EIGEN_TEST_FMA)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+    message(STATUS "Enabling FMA in tests/examples")
+  endif()
+
  option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
  if(EIGEN_TEST_ALTIVEC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
@@ -204,7 +221,7 @@ if(NOT MSVC)

  option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
  if(EIGEN_TEST_NEON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a"8)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8")
    message(STATUS "Enabling NEON in tests/examples")
  endif()

@@ -284,6 +301,12 @@ if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT)
  message(STATUS "Disabling alignment in tests/examples")
 endif()

+option(EIGEN_TEST_NO_EXCEPTIONS "Disables C++ exceptions" OFF)
+if(EIGEN_TEST_NO_EXCEPTIONS)
+  ei_add_cxx_compiler_flag("-fno-exceptions")
+  message(STATUS "Disabling exceptions in tests/examples")
+endif()
+
 option(EIGEN_TEST_C++0x "Enables all C++0x features." OFF)

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
@@ -418,3 +441,31 @@ else()
 endif()

 message(STATUS "")
+
+set ( EIGEN_CONFIG_CMAKE_PATH
+      lib${LIB_SUFFIX}/cmake/eigen3
+      CACHE PATH "The directory where the CMake files are installed"
+    )
+if ( NOT IS_ABSOLUTE EIGEN_CONFIG_CMAKE_PATH )
+  set ( EIGEN_CONFIG_CMAKE_PATH ${CMAKE_INSTALL_PREFIX}/${EIGEN_CONFIG_CMAKE_PATH} )
+endif ()
+
+set ( EIGEN_USE_FILE ${EIGEN_CONFIG_CMAKE_PATH}/UseEigen3.cmake )
+set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
+set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
+set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
+set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
+set ( EIGEN_DEFINITIONS "")
+set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} )
+set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
+set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
+
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+                 ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+                 @ONLY ESCAPE_QUOTES
+               )
+
+install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
+                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+          DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
+        )
--- a/Eigen/Array
+++ b/Eigen/Array
@@ -1,11 +0,0 @@
-#ifndef EIGEN_ARRAY_MODULE_H
-#define EIGEN_ARRAY_MODULE_H
-
-// include Core first to handle Eigen2 support macros
-#include "Core"
-
-#ifndef EIGEN2_SUPPORT
-  #error The Eigen/Array header does no longer exist in Eigen3. All that functionality has moved to Eigen/Core.
-#endif
-
-#endif // EIGEN_ARRAY_MODULE_H
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -10,9 +10,11 @@
  *
  *
  * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
-  * Those decompositions are accessible via the following MatrixBase methods:
-  *  - MatrixBase::llt(),
+  * Those decompositions are also accessible via the following methods:
+  *  - MatrixBase::llt()
  *  - MatrixBase::ldlt()
+  *  - SelfAdjointView::llt()
+  *  - SelfAdjointView::ldlt()
  *
  * \code
  * #include <Eigen/Cholesky>
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,6 +14,42 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"

+// Handle NVCC/CUDA
+#ifdef __CUDACC__
+  // Do not try asserts on CUDA!
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+  
+  // Do not try to vectorize on CUDA!
+  #define EIGEN_DONT_VECTORIZE
+  
+  // All functions callable from CUDA code must be qualified with __device__
+  #define EIGEN_DEVICE_FUNC __host__ __device__
+  
+#else
+  #define EIGEN_DEVICE_FUNC
+  
+#endif
+
+#if defined(__CUDA_ARCH__)
+  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+#endif
+
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+#ifdef EIGEN_EXCEPTIONS
+  #include <new>
+#endif
+
 // then include this file where all our macros are defined. It's really important to do it first because
 // it's where we do all the alignment settings (platform detection and honoring the user's will if he
 // defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
@@ -82,7 +118,16 @@
    #ifdef __SSE4_2__
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
-
+    #ifdef __AVX__
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __FMA__
+      #define EIGEN_VECTORIZE_FMA
+    #endif
    // include files

    // This extern "C" works around a MINGW-w64 compilation issue
@@ -95,7 +140,7 @@
    extern "C" {
      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #ifdef __INTEL_COMPILER
+      #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
        #include <immintrin.h>
      #else
        #include <emmintrin.h>
@@ -112,6 +157,9 @@
        #ifdef EIGEN_VECTORIZE_SSE4_2
        #include <nmmintrin.h>
        #endif
+        #ifdef EIGEN_VECTORIZE_AVX
+        #include <immintrin.h>
+        #endif
      #endif
    } // end extern "C"
  #elif defined __ALTIVEC__
@@ -165,23 +213,17 @@
 #endif

 // required for __cpuid, needs to be included after cmath
-#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64))
+#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64)) && (!defined(_WIN32_WCE))
  #include <intrin.h>
 #endif

-#if defined(_CPPUNWIND) || defined(__EXCEPTIONS)
-  #define EIGEN_EXCEPTIONS
-#endif
-
-#ifdef EIGEN_EXCEPTIONS
-  #include <new>
-#endif
-
 /** \brief Namespace containing all symbols from the %Eigen library. */
 namespace Eigen {

 inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_SSE4_2)
+#if defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_1)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
@@ -202,34 +244,9 @@ inline static const char *SimdInstructionSetsInUse(void) {

 } // end namespace Eigen

-#define STAGE10_FULL_EIGEN2_API             10
-#define STAGE20_RESOLVE_API_CONFLICTS       20
-#define STAGE30_FULL_EIGEN3_API             30
-#define STAGE40_FULL_EIGEN3_STRICTNESS      40
-#define STAGE99_NO_EIGEN2_SUPPORT           99
-
-#if   defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE40_FULL_EIGEN3_STRICTNESS
-#elif defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#elif defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE20_RESOLVE_API_CONFLICTS
-#elif defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE10_FULL_EIGEN2_API
-#elif defined EIGEN2_SUPPORT
-  // default to stage 3, that's what it's always meant
-  #define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#else
-  #define EIGEN2_SUPPORT_STAGE STAGE99_NO_EIGEN2_SUPPORT
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#undef minor
+#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
+// This will generate an error message:
+#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif

 // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
@@ -259,7 +276,13 @@ using std::ptrdiff_t;
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"

-#if defined EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_AVX
+  // Use AVX for floats and doubles, SSE for integers
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/Complex.h"
+#elif defined EIGEN_VECTORIZE_SSE
  #include "src/Core/arch/SSE/PacketMath.h"
  #include "src/Core/arch/SSE/MathFunctions.h"
  #include "src/Core/arch/SSE/Complex.h"
@@ -273,17 +296,30 @@ using std::ptrdiff_t;

 #include "src/Core/arch/Default/Settings.h"

-#include "src/Core/Functors.h"
+#include "src/Core/functors/BinaryFunctors.h"
+#include "src/Core/functors/UnaryFunctors.h"
+#include "src/Core/functors/NullaryFunctors.h"
+#include "src/Core/functors/StlFunctors.h"
+
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
 #include "src/Core/EigenBase.h"

+#ifdef EIGEN_ENABLE_EVALUATORS
+#include "src/Core/functors/AssignmentFunctors.h"
+#include "src/Core/Product.h"
+#include "src/Core/CoreEvaluators.h"
+#include "src/Core/AssignEvaluator.h"
+#include "src/Core/ProductEvaluators.h"
+#endif
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
                                // at least confirmed with Doxygen 1.5.5 and 1.5.6
  #include "src/Core/Assign.h"
 #endif

+#include "src/Core/ArrayBase.h"
 #include "src/Core/util/BlasUtil.h"
 #include "src/Core/DenseStorage.h"
 #include "src/Core/NestByValue.h"
@@ -347,7 +383,6 @@ using std::ptrdiff_t;
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
-#include "src/Core/ArrayBase.h"
 #include "src/Core/ArrayWrapper.h"

 #ifdef EIGEN_USE_BLAS
@@ -369,8 +404,4 @@ using std::ptrdiff_t;

 #include "src/Core/util/ReenableStupidWarnings.h"

-#ifdef EIGEN2_SUPPORT
-#include "Eigen2Support"
-#endif
-
 #endif // EIGEN_CORE_H
--- a/Eigen/Eigen
+++ b/Eigen/Eigen
@@ -1,2 +1,2 @@
 #include "Dense"
-//#include "Sparse"
+#include "Sparse"
--- a/Eigen/Eigen2Support
+++ b/Eigen/Eigen2Support
@@ -1,82 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2SUPPORT_H
-#define EIGEN2SUPPORT_H
-
-#if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H))
-#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header
-#endif
-
-#include "src/Core/util/DisableStupidWarnings.h"
-
-/** \ingroup Support_modules
-  * \defgroup Eigen2Support_Module Eigen2 support module
-  * This module provides a couple of deprecated functions improving the compatibility with Eigen2.
-  *
-  * To use it, define EIGEN2_SUPPORT before including any Eigen header
-  * \code
-  * #define EIGEN2_SUPPORT
-  * \endcode
-  *
-  */
-
-#include "src/Eigen2Support/Macros.h"
-#include "src/Eigen2Support/Memory.h"
-#include "src/Eigen2Support/Meta.h"
-#include "src/Eigen2Support/Lazy.h"
-#include "src/Eigen2Support/Cwise.h"
-#include "src/Eigen2Support/CwiseOperators.h"
-#include "src/Eigen2Support/TriangularSolver.h"
-#include "src/Eigen2Support/Block.h"
-#include "src/Eigen2Support/VectorBlock.h"
-#include "src/Eigen2Support/Minor.h"
-#include "src/Eigen2Support/MathFunctions.h"
-
-
-#include "src/Core/util/ReenableStupidWarnings.h"
-
-// Eigen2 used to include iostream
-#include<iostream>
-
-#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
-using Eigen::Matrix##SizeSuffix##TypeSuffix; \
-using Eigen::Vector##SizeSuffix##TypeSuffix; \
-using Eigen::RowVector##SizeSuffix##TypeSuffix;
-
-#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(TypeSuffix) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
-
-#define EIGEN_USING_MATRIX_TYPEDEFS \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(i) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(f) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(d) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cf) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cd)
-
-#define USING_PART_OF_NAMESPACE_EIGEN \
-EIGEN_USING_MATRIX_TYPEDEFS \
-using Eigen::Matrix; \
-using Eigen::MatrixBase; \
-using Eigen::ei_random; \
-using Eigen::ei_real; \
-using Eigen::ei_imag; \
-using Eigen::ei_conj; \
-using Eigen::ei_abs; \
-using Eigen::ei_abs2; \
-using Eigen::ei_sqrt; \
-using Eigen::ei_exp; \
-using Eigen::ei_log; \
-using Eigen::ei_sin; \
-using Eigen::ei_cos;
-
-#endif // EIGEN2SUPPORT_H
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -33,27 +33,23 @@
 #include "src/Geometry/OrthoMethods.h"
 #include "src/Geometry/EulerAngles.h"

-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-  #include "src/Geometry/Homogeneous.h"
-  #include "src/Geometry/RotationBase.h"
-  #include "src/Geometry/Rotation2D.h"
-  #include "src/Geometry/Quaternion.h"
-  #include "src/Geometry/AngleAxis.h"
-  #include "src/Geometry/Transform.h"
-  #include "src/Geometry/Translation.h"
-  #include "src/Geometry/Scaling.h"
-  #include "src/Geometry/Hyperplane.h"
-  #include "src/Geometry/ParametrizedLine.h"
-  #include "src/Geometry/AlignedBox.h"
-  #include "src/Geometry/Umeyama.h"
+#include "src/Geometry/Homogeneous.h"
+#include "src/Geometry/RotationBase.h"
+#include "src/Geometry/Rotation2D.h"
+#include "src/Geometry/Quaternion.h"
+#include "src/Geometry/AngleAxis.h"
+#include "src/Geometry/Transform.h"
+#include "src/Geometry/Translation.h"
+#include "src/Geometry/Scaling.h"
+#include "src/Geometry/Hyperplane.h"
+#include "src/Geometry/ParametrizedLine.h"
+#include "src/Geometry/AlignedBox.h"
+#include "src/Geometry/Umeyama.h"

-  #if defined EIGEN_VECTORIZE_SSE
-    #include "src/Geometry/arch/Geometry_SSE.h"
-  #endif
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/Geometry/All.h"
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
+#include "src/Geometry/arch/Geometry_SSE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -27,14 +27,12 @@
 #include "src/LU/Determinant.h"
 #include "src/LU/Inverse.h"

-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
  #include "src/LU/arch/Inverse_SSE.h"
 #endif

-#ifdef EIGEN2_SUPPORT
-  #include "src/Eigen2Support/LU.h"
-#endif
-
 #include "src/Core/util/ReenableStupidWarnings.h"

 #endif // EIGEN_LU_MODULE_H
--- a/Eigen/LeastSquares
+++ b/Eigen/LeastSquares
@@ -1,32 +0,0 @@
-#ifndef EIGEN_REGRESSION_MODULE_H
-#define EIGEN_REGRESSION_MODULE_H
-
-#ifndef EIGEN2_SUPPORT
-#error LeastSquares is only available in Eigen2 support mode (define EIGEN2_SUPPORT)
-#endif
-
-// exclude from normal eigen3-only documentation
-#ifdef EIGEN2_SUPPORT
-
-#include "Core"
-
-#include "src/Core/util/DisableStupidWarnings.h"
-
-#include "Eigenvalues"
-#include "Geometry"
-
-/** \defgroup LeastSquares_Module LeastSquares module
-  * This module provides linear regression and related features.
-  *
-  * \code
-  * #include <Eigen/LeastSquares>
-  * \endcode
-  */
-
-#include "src/Eigen2Support/LeastSquares.h"
-
-#include "src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN2_SUPPORT
-
-#endif // EIGEN_REGRESSION_MODULE_H
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -15,7 +15,9 @@
  *
  * This module provides various QR decompositions
  * This module also provides some MatrixBase methods, including:
-  *  - MatrixBase::qr(),
+  *  - MatrixBase::householderQr()
+  *  - MatrixBase::colPivHouseholderQr()
+  *  - MatrixBase::fullPivHouseholderQr()
  *
  * \code
  * #include <Eigen/QR>
@@ -31,15 +33,7 @@
 #include "src/QR/ColPivHouseholderQR_MKL.h"
 #endif

-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/QR.h"
-#endif
-
 #include "src/Core/util/ReenableStupidWarnings.h"

-#ifdef EIGEN2_SUPPORT
-#include "Eigenvalues"
-#endif
-
 #endif // EIGEN_QR_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -21,16 +21,13 @@
  */

 #include "src/misc/Solve.h"
+#include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
 #include "src/SVD/JacobiSVD_MKL.h"
 #endif
 #include "src/SVD/UpperBidiagonalization.h"

-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/SVD.h"
-#endif
-
 #include "src/Core/util/ReenableStupidWarnings.h"

 #endif // EIGEN_SVD_MODULE_H
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -16,7 +16,10 @@
 namespace Eigen { 

 namespace internal {
-template<typename MatrixType, int UpLo> struct LDLT_Traits;
+  template<typename MatrixType, int UpLo> struct LDLT_Traits;
+
+  // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
+  enum SignMatrix { PositiveSemiDef, NegativeSemiDef, ZeroSign, Indefinite };
 }

 /** \ingroup Cholesky_Module
@@ -40,7 +43,7 @@ template<typename MatrixType, int UpLo> struct LDLT_Traits;
  * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
  * decomposition to determine whether a system of equations has a solution.
  *
-  * \sa MatrixBase::ldlt(), class LLT
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
  */
 template<typename _MatrixType, int _UpLo> class LDLT
 {
@@ -69,7 +72,12 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * The default constructor is useful in cases in which the user intends to
      * perform decompositions via LDLT::compute(const MatrixType&).
      */
-    LDLT() : m_matrix(), m_transpositions(), m_isInitialized(false) {}
+    LDLT() 
+      : m_matrix(), 
+        m_transpositions(), 
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false) 
+    {}

    /** \brief Default Constructor with memory preallocation
      *
@@ -81,6 +89,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
      : m_matrix(size, size),
        m_transpositions(size),
        m_temporary(size),
+        m_sign(internal::ZeroSign),
        m_isInitialized(false)
    {}

@@ -93,6 +102,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
      : m_matrix(matrix.rows(), matrix.cols()),
        m_transpositions(matrix.rows()),
        m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
        m_isInitialized(false)
    {
      compute(matrix);
@@ -139,21 +149,14 @@ template<typename _MatrixType, int _UpLo> class LDLT
    inline bool isPositive() const
    {
      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_sign == 1;
+      return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
    }
-    
-    #ifdef EIGEN2_SUPPORT
-    inline bool isPositiveDefinite() const
-    {
-      return isPositive();
-    }
-    #endif

    /** \returns true if the matrix is negative (semidefinite) */
    inline bool isNegative(void) const
    {
      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_sign == -1;
+      return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
    }

    /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
@@ -169,7 +172,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
      * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
      * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
      *
-      * \sa MatrixBase::ldlt()
+      * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
      */
    template<typename Rhs>
    inline const internal::solve_retval<LDLT, Rhs>
@@ -181,15 +184,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
      return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
    }

-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    #endif
-
    template<typename Derived>
    bool solveInPlace(MatrixBase<Derived> &bAndX) const;

@@ -235,7 +229,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    MatrixType m_matrix;
    TranspositionType m_transpositions;
    TmpMatrixType m_temporary;
-    int m_sign;
+    internal::SignMatrix m_sign;
    bool m_isInitialized;
 };

@@ -246,49 +240,33 @@ template<int UpLo> struct ldlt_inplace;
 template<> struct ldlt_inplace<Lower>
 {
  template<typename MatrixType, typename TranspositionType, typename Workspace>
-  static bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, int* sign=0)
+  static bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign)
  {
    using std::abs;
    typedef typename MatrixType::Scalar Scalar;
    typedef typename MatrixType::RealScalar RealScalar;
    typedef typename MatrixType::Index Index;
+    typedef typename TranspositionType::StorageIndexType IndexType;
    eigen_assert(mat.rows()==mat.cols());
    const Index size = mat.rows();

    if (size <= 1)
    {
      transpositions.setIdentity();
-      if(sign)
-        *sign = numext::real(mat.coeff(0,0))>0 ? 1:-1;
+      if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef;
+      else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef;
+      else sign = ZeroSign;
      return true;
    }

-    RealScalar cutoff(0), biggest_in_corner;
-
    for (Index k = 0; k < size; ++k)
    {
      // Find largest diagonal element
      Index index_of_biggest_in_corner;
-      biggest_in_corner = mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
+      mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
      index_of_biggest_in_corner += k;

-      if(k == 0)
-      {
-        // The biggest overall is the point of reference to which further diagonals
-        // are compared; if any diagonal is negligible compared
-        // to the largest overall, the algorithm bails.
-        cutoff = abs(NumTraits<Scalar>::epsilon() * biggest_in_corner);
-      }
-
-      // Finish early if the matrix is not full rank.
-      if(biggest_in_corner < cutoff)
-      {
-        for(Index i = k; i < size; i++) transpositions.coeffRef(i) = i;
-        if(sign) *sign = 0;
-        break;
-      }
-
-      transpositions.coeffRef(k) = index_of_biggest_in_corner;
+      transpositions.coeffRef(k) = IndexType(index_of_biggest_in_corner);
      if(k != index_of_biggest_in_corner)
      {
        // apply the transposition while taking care to consider only
@@ -297,7 +275,7 @@ template<> struct ldlt_inplace<Lower>
        mat.row(k).head(k).swap(mat.row(index_of_biggest_in_corner).head(k));
        mat.col(k).tail(s).swap(mat.col(index_of_biggest_in_corner).tail(s));
        std::swap(mat.coeffRef(k,k),mat.coeffRef(index_of_biggest_in_corner,index_of_biggest_in_corner));
-        for(int i=k+1;i<index_of_biggest_in_corner;++i)
+        for(Index i=k+1;i<index_of_biggest_in_corner;++i)
        {
          Scalar tmp = mat.coeffRef(i,k);
          mat.coeffRef(i,k) = numext::conj(mat.coeffRef(index_of_biggest_in_corner,i));
@@ -318,22 +296,27 @@ template<> struct ldlt_inplace<Lower>

      if(k>0)
      {
-        temp.head(k) = mat.diagonal().head(k).asDiagonal() * A10.adjoint();
+        temp.head(k) = mat.diagonal().real().head(k).asDiagonal() * A10.adjoint();
        mat.coeffRef(k,k) -= (A10 * temp.head(k)).value();
        if(rs>0)
          A21.noalias() -= A20 * temp.head(k);
      }
-      if((rs>0) && (abs(mat.coeffRef(k,k)) > cutoff))
-        A21 /= mat.coeffRef(k,k);
      
-      if(sign)
-      {
-        // LDLT is not guaranteed to work for indefinite matrices, but let's try to get the sign right
-        int newSign = numext::real(mat.diagonal().coeff(index_of_biggest_in_corner)) > 0;
-        if(k == 0)
-          *sign = newSign;
-        else if(*sign != newSign)
-          *sign = 0;
+      // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
+      // was smaller than the cutoff value. However, soince LDLT is not rank-revealing
+      // we should only make sure we do not introduce INF or NaN values.
+      // LAPACK also uses 0 as the cutoff value.
+      RealScalar realAkk = numext::real(mat.coeffRef(k,k));
+      if((rs>0) && (abs(realAkk) > RealScalar(0)))
+        A21 /= realAkk;
+
+      if (sign == PositiveSemiDef) {
+        if (realAkk < 0) sign = Indefinite;
+      } else if (sign == NegativeSemiDef) {
+        if (realAkk > 0) sign = Indefinite;
+      } else if (sign == ZeroSign) {
+        if (realAkk > 0) sign = PositiveSemiDef;
+        else if (realAkk < 0) sign = NegativeSemiDef;
      }
    }

@@ -399,7 +382,7 @@ template<> struct ldlt_inplace<Lower>
 template<> struct ldlt_inplace<Upper>
 {
  template<typename MatrixType, typename TranspositionType, typename Workspace>
-  static EIGEN_STRONG_INLINE bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, int* sign=0)
+  static EIGEN_STRONG_INLINE bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign)
  {
    Transpose<MatrixType> matt(mat);
    return ldlt_inplace<Lower>::unblocked(matt, transpositions, temp, sign);
@@ -445,7 +428,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
  m_isInitialized = false;
  m_temporary.resize(size);

-  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, &m_sign);
+  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);

  m_isInitialized = true;
  return *this;
@@ -460,6 +443,7 @@ template<typename MatrixType, int _UpLo>
 template<typename Derived>
 LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename NumTraits<typename MatrixType::Scalar>::Real& sigma)
 {
+  typedef typename TranspositionType::StorageIndexType IndexType;
  const Index size = w.rows();
  if (m_isInitialized)
  {
@@ -471,9 +455,9 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
    m_matrix.setZero();
    m_transpositions.resize(size);
    for (Index i = 0; i < size; i++)
-      m_transpositions.coeffRef(i) = i;
+      m_transpositions.coeffRef(i) = IndexType(i);
    m_temporary.resize(size);
-    m_sign = sigma>=0 ? 1 : -1;
+    m_sign = sigma>=0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
    m_isInitialized = true;
  }

@@ -502,18 +486,22 @@ struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
    // dst = D^-1 (L^-1 P b)
    // more precisely, use pseudo-inverse of D (see bug 241)
    using std::abs;
-    using std::max;
+    EIGEN_USING_STD_MATH(max);
    typedef typename LDLTType::MatrixType MatrixType;
-    typedef typename LDLTType::Scalar Scalar;
    typedef typename LDLTType::RealScalar RealScalar;
-    const Diagonal<const MatrixType> vectorD = dec().vectorD();
-    RealScalar tolerance = (max)(vectorD.array().abs().maxCoeff() * NumTraits<Scalar>::epsilon(),
-				 RealScalar(1) / NumTraits<RealScalar>::highest()); // motivated by LAPACK's xGELSS
+    const typename Diagonal<const MatrixType>::RealReturnType vectorD(dec().vectorD());
+    // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
+    // as motivated by LAPACK's xGELSS:
+    // RealScalar tolerance = (max)(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
+    // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
+    // diagonal element is not well justified and to numerical issues in some cases.
+    // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
+    RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
    for (Index i = 0; i < vectorD.size(); ++i) {
      if(abs(vectorD(i)) > tolerance)
-	dst.row(i) /= vectorD(i);
+        dst.row(i) /= vectorD(i);
      else
-	dst.row(i).setZero();
+        dst.row(i).setZero();
    }

    // dst = L^-T (D^-1 L^-1 P b)
@@ -566,7 +554,7 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
  // L^* P
  res = matrixU() * res;
  // D(L^*P)
-  res = vectorD().asDiagonal() * res;
+  res = vectorD().real().asDiagonal() * res;
  // L(DL^*P)
  res = matrixL() * res;
  // P^T (LDL^*P)
@@ -575,8 +563,10 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
  return res;
 }

+#ifndef __CUDACC__
 /** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa MatrixBase::ldlt()
  */
 template<typename MatrixType, unsigned int UpLo>
 inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -587,6 +577,7 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const

 /** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa SelfAdjointView::ldlt()
  */
 template<typename Derived>
 inline const LDLT<typename MatrixBase<Derived>::PlainObject>
@@ -594,6 +585,7 @@ MatrixBase<Derived>::ldlt() const
 {
  return LDLT<PlainObject>(derived());
 }
+#endif // __CUDACC__

 } // end namespace Eigen

--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -41,7 +41,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  * Example: \include LLT_example.cpp
  * Output: \verbinclude LLT_example.out
  *    
-  * \sa MatrixBase::llt(), class LDLT
+  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
  */
 /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
@@ -115,7 +115,7 @@ template<typename _MatrixType, int _UpLo> class LLT
      * Example: \include LLT_solve.cpp
      * Output: \verbinclude LLT_solve.out
      *
-      * \sa solveInPlace(), MatrixBase::llt()
+      * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
      */
    template<typename Rhs>
    inline const internal::solve_retval<LLT, Rhs>
@@ -127,17 +127,6 @@ template<typename _MatrixType, int _UpLo> class LLT
      return internal::solve_retval<LLT, Rhs>(*this, b.derived());
    }

-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    
-    bool isPositiveDefinite() const { return true; }
-    #endif
-
    template<typename Derived>
    void solveInPlace(MatrixBase<Derived> &bAndX) const;

@@ -465,8 +454,10 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
  return matrixL() * matrixL().adjoint().toDenseMatrix();
 }

+#ifndef __CUDACC__
 /** \cholesky_module
  * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
  */
 template<typename Derived>
 inline const LLT<typename MatrixBase<Derived>::PlainObject>
@@ -477,6 +468,7 @@ MatrixBase<Derived>::llt() const

 /** \cholesky_module
  * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
  */
 template<typename MatrixType, unsigned int UpLo>
 inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -484,7 +476,8 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
 {
  return LLT<PlainObject,UpLo>(m_matrix);
 }
-
+#endif // __CUDACC__
+  
 } // end namespace Eigen

 #endif // EIGEN_LLT_H
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -58,10 +58,12 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
  res.p       = mat.outerIndexPtr();
  res.i       = mat.innerIndexPtr();
  res.x       = mat.valuePtr();
+  res.z       = 0;
  res.sorted  = 1;
  if(mat.isCompressed())
  {
    res.packed  = 1;
+    res.nz = 0;
  }
  else
  {
@@ -170,6 +172,7 @@ class CholmodBase : internal::noncopyable
    CholmodBase()
      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
    {
+      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
      cholmod_start(&m_cholmod);
    }

@@ -241,7 +244,7 @@ class CholmodBase : internal::noncopyable
      return internal::sparse_solve_retval<CholmodBase, Rhs>(*this, b.derived());
    }
    
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+    /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
      *
      * This function is particularly useful when solving for several problems having the same structure.
      * 
@@ -265,7 +268,7 @@ class CholmodBase : internal::noncopyable
    
    /** Performs a numeric decomposition of \a matrix
      *
-      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
      *
      * \sa analyzePattern()
      */
@@ -302,7 +305,7 @@ class CholmodBase : internal::noncopyable
      {
        this->m_info = NumericalIssue;
      }
-      // TODO optimize this copy by swapping when possible (be carreful with alignment, etc.)
+      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
      dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
      cholmod_free_dense(&x_cd, &m_cholmod);
    }
@@ -323,7 +326,7 @@ class CholmodBase : internal::noncopyable
      {
        this->m_info = NumericalIssue;
      }
-      // TODO optimize this copy by swapping when possible (be carreful with alignment, etc.)
+      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
      dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
      cholmod_free_sparse(&x_cs, &m_cholmod);
    }
@@ -365,8 +368,8 @@ class CholmodBase : internal::noncopyable
  *
  * This class allows to solve for A.X = B sparse linear problems via a simplicial LL^T Cholesky factorization
  * using the Cholmod library.
-  * This simplicial variant is equivalent to Eigen's built-in SimplicialLLT class. Thefore, it has little practical interest.
-  * The sparse matrix A must be selfajoint and positive definite. The vectors or matrices
+  * This simplicial variant is equivalent to Eigen's built-in SimplicialLLT class. Therefore, it has little practical interest.
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
  * X and B can be either dense or sparse.
  *
  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
@@ -412,8 +415,8 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
  *
  * This class allows to solve for A.X = B sparse linear problems via a simplicial LDL^T Cholesky factorization
  * using the Cholmod library.
-  * This simplicial variant is equivalent to Eigen's built-in SimplicialLDLT class. Thefore, it has little practical interest.
-  * The sparse matrix A must be selfajoint and positive definite. The vectors or matrices
+  * This simplicial variant is equivalent to Eigen's built-in SimplicialLDLT class. Therefore, it has little practical interest.
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
  * X and B can be either dense or sparse.
  *
  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
@@ -458,7 +461,7 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
  * This class allows to solve for A.X = B sparse linear problems via a supernodal LL^T Cholesky factorization
  * using the Cholmod library.
  * This supernodal variant performs best on dense enough problems, e.g., 3D FEM, or very high order 2D FEM.
-  * The sparse matrix A must be selfajoint and positive definite. The vectors or matrices
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
  * X and B can be either dense or sparse.
  *
  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
@@ -501,7 +504,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
  * \brief A general Cholesky factorization and solver based on Cholmod
  *
  * This class allows to solve for A.X = B sparse linear problems via a LL^T or LDL^T Cholesky factorization
-  * using the Cholmod library. The sparse matrix A must be selfajoint and positive definite. The vectors or matrices
+  * using the Cholmod library. The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
  * X and B can be either dense or sparse.
  *
  * This variant permits to change the underlying Cholesky method at runtime.
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -69,6 +69,7 @@ class Array
      * the usage of 'using'. This should be done only for operator=.
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived> &other)
    {
      return Base::operator=(other);
@@ -84,6 +85,7 @@ class Array
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
    {
      return Base::_set(other);
@@ -92,6 +94,7 @@ class Array
    /** This is a special case of the templated operator=. Its purpose is to
      * prevent a default operator= from hiding the templated operator=.
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array& operator=(const Array& other)
    {
      return Base::_set(other);
@@ -107,6 +110,7 @@ class Array
      *
      * \sa resize(Index,Index)
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array() : Base()
    {
      Base::_check_template_params();
@@ -116,6 +120,7 @@ class Array
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    // FIXME is it still needed ??
    /** \internal */
+    EIGEN_DEVICE_FUNC
    Array(internal::constructor_without_unaligned_array_assert)
      : Base(internal::constructor_without_unaligned_array_assert())
    {
@@ -124,41 +129,63 @@ class Array
    }
 #endif

-    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE explicit Array(Index dim)
-      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    Array(Array&& other)
+      : Base(std::move(other))
    {
      Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Array)
-      eigen_assert(dim >= 0);
-      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
+        Base::_set_noalias(other);
    }
+    Array& operator=(Array&& other)
+    {
+      other.swap(*this);
+      return *this;
+    }
+#endif
+

    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Array(const T& x)
+    {
+      Base::_check_template_params();
+      Base::template _init1<T>(x);
+    }
+
    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1)
    {
      Base::_check_template_params();
      this->template _init2<T0,T1>(val0, val1);
    }
    #else
-    /** constructs an uninitialized matrix with \a rows rows and \a cols columns.
+    /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
+    EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
+    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
      *
-      * This is useful for dynamic-size matrices. For fixed-size matrices,
+      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass the dimension here, so it makes more sense to use the default
+      * constructor Array() instead.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Array(Index dim);
+    /** constructs an initialized 1x1 Array with the given coefficient */
+    Array(const Scalar& value);
+    /** constructs an uninitialized array with \a rows rows and \a cols columns.
+      *
+      * This is useful for dynamic-size arrays. For fixed-size arrays,
      * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
+      * Array() instead. */
    Array(Index rows, Index cols);
    /** constructs an initialized 2D vector with given coefficients */
    Array(const Scalar& val0, const Scalar& val1);
    #endif

    /** constructs an initialized 3D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
    {
      Base::_check_template_params();
@@ -168,6 +195,7 @@ class Array
      m_storage.data()[2] = val2;
    }
    /** constructs an initialized 4D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
    {
      Base::_check_template_params();
@@ -178,10 +206,9 @@ class Array
      m_storage.data()[3] = val3;
    }

-    explicit Array(const Scalar *data);
-
    /** Constructor copying the value of the expression \a other */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
             : Base(other.rows() * other.cols(), other.rows(), other.cols())
    {
@@ -189,6 +216,7 @@ class Array
      Base::_set_noalias(other);
    }
    /** Copy constructor */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Array& other)
            : Base(other.rows() * other.cols(), other.rows(), other.cols())
    {
@@ -197,6 +225,7 @@ class Array
    }
    /** Copy constructor with in-place evaluation */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
    {
      Base::_check_template_params();
@@ -206,11 +235,12 @@ class Array

    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
    {
      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
+      Base::_resize_to_match(other);
      *this = other;
    }

@@ -221,8 +251,8 @@ class Array
    void swap(ArrayBase<OtherDerived> const & other)
    { this->_swap(other.derived()); }

-    inline Index innerStride() const { return 1; }
-    inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }

    #ifdef EIGEN_ARRAY_PLUGIN
    #include EIGEN_ARRAY_PLUGIN
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -118,40 +118,50 @@ template<typename Derived> class ArrayBase
    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const ArrayBase& other)
    {
      return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
    }

-    Derived& operator+=(const Scalar& scalar)
-    { return *this = derived() + scalar; }
-    Derived& operator-=(const Scalar& scalar)
-    { return *this = derived() - scalar; }
+    EIGEN_DEVICE_FUNC
+    Derived& operator+=(const Scalar& scalar);
+    EIGEN_DEVICE_FUNC
+    Derived& operator-=(const Scalar& scalar);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator+=(const ArrayBase<OtherDerived>& other);
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator-=(const ArrayBase<OtherDerived>& other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator*=(const ArrayBase<OtherDerived>& other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator/=(const ArrayBase<OtherDerived>& other);

  public:
+    EIGEN_DEVICE_FUNC
    ArrayBase<Derived>& array() { return *this; }
+    EIGEN_DEVICE_FUNC
    const ArrayBase<Derived>& array() const { return *this; }

    /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
      * \sa MatrixBase::array() */
+    EIGEN_DEVICE_FUNC
    MatrixWrapper<Derived> matrix() { return derived(); }
+    EIGEN_DEVICE_FUNC
    const MatrixWrapper<const Derived> matrix() const { return derived(); }

 //     template<typename Dest>
 //     inline void evalTo(Dest& dst) const { dst = matrix(); }

  protected:
+    EIGEN_DEVICE_FUNC
    ArrayBase() : Base() {}

  private:
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -29,6 +29,11 @@ struct traits<ArrayWrapper<ExpressionType> >
  : public traits<typename remove_all<typename ExpressionType::Nested>::type >
 {
  typedef ArrayXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    Flags = Flags0 & ~NestByRefBit
+  };
 };
 }

@@ -48,41 +53,54 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >

    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;

-    inline ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}

+    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return m_expression.innerStride(); }

+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return m_expression.data(); }

+    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index rowId, Index colId) const
    {
      return m_expression.coeff(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index rowId, Index colId)
    {
      return m_expression.const_cast_derived().coeffRef(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return m_expression.const_cast_derived().coeffRef(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index index) const
    {
      return m_expression.coeff(index);
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index)
    {
      return m_expression.const_cast_derived().coeffRef(index);
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return m_expression.const_cast_derived().coeffRef(index);
@@ -113,9 +131,11 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
    }

    template<typename Dest>
+    EIGEN_DEVICE_FUNC
    inline void evalTo(Dest& dst) const { dst = m_expression; }

    const typename internal::remove_all<NestedExpressionType>::type& 
+    EIGEN_DEVICE_FUNC
    nestedExpression() const 
    {
      return m_expression;
@@ -123,9 +143,11 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >

    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index)  */
+    EIGEN_DEVICE_FUNC
    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index,Index)*/
+    EIGEN_DEVICE_FUNC
    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }

  protected:
@@ -149,6 +171,11 @@ struct traits<MatrixWrapper<ExpressionType> >
 : public traits<typename remove_all<typename ExpressionType::Nested>::type >
 {
  typedef MatrixXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    Flags = Flags0 & ~NestByRefBit
+  };
 };
 }

@@ -168,41 +195,54 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >

    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;

+    EIGEN_DEVICE_FUNC
    inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}

+    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return m_expression.innerStride(); }

+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return m_expression.data(); }

+    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index rowId, Index colId) const
    {
      return m_expression.coeff(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index rowId, Index colId)
    {
      return m_expression.const_cast_derived().coeffRef(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return m_expression.derived().coeffRef(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index index) const
    {
      return m_expression.coeff(index);
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index)
    {
      return m_expression.const_cast_derived().coeffRef(index);
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return m_expression.const_cast_derived().coeffRef(index);
@@ -232,6 +272,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
    }

+    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<NestedExpressionType>::type& 
    nestedExpression() const 
    {
@@ -240,9 +281,11 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >

    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index)  */
+    EIGEN_DEVICE_FUNC
    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index,Index)*/
+    EIGEN_DEVICE_FUNC
    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }

  protected:
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -105,6 +105,8 @@ public:
    EIGEN_DEBUG_VAR(DstIsAligned)
    EIGEN_DEBUG_VAR(SrcIsAligned)
    EIGEN_DEBUG_VAR(JointAlignment)
+    EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime)
+    EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost)
    EIGEN_DEBUG_VAR(InnerSize)
    EIGEN_DEBUG_VAR(InnerMaxSize)
    EIGEN_DEBUG_VAR(PacketSize)
@@ -139,6 +141,7 @@ struct assign_DefaultTraversal_CompleteUnrolling
    inner = Index % Derived1::InnerSizeAtCompileTime
  };

+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
  {
    dst.copyCoeffByOuterInner(outer, inner, src);
@@ -149,12 +152,14 @@ struct assign_DefaultTraversal_CompleteUnrolling
 template<typename Derived1, typename Derived2, int Stop>
 struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
 {
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
 };

 template<typename Derived1, typename Derived2, int Index, int Stop>
 struct assign_DefaultTraversal_InnerUnrolling
 {
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
  {
    dst.copyCoeffByOuterInner(outer, Index, src);
@@ -165,6 +170,7 @@ struct assign_DefaultTraversal_InnerUnrolling
 template<typename Derived1, typename Derived2, int Stop>
 struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
 {
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
 };

@@ -175,6 +181,7 @@ struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
 template<typename Derived1, typename Derived2, int Index, int Stop>
 struct assign_LinearTraversal_CompleteUnrolling
 {
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
  {
    dst.copyCoeff(Index, src);
@@ -185,6 +192,7 @@ struct assign_LinearTraversal_CompleteUnrolling
 template<typename Derived1, typename Derived2, int Stop>
 struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
 {
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
 };

@@ -249,6 +257,7 @@ struct assign_impl;
 template<typename Derived1, typename Derived2, int Unrolling, int Version>
 struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &, const Derived2 &) { }
 };

@@ -256,6 +265,7 @@ template<typename Derived1, typename Derived2, int Version>
 struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
 {
  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC 
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    const Index innerSize = dst.innerSize();
@@ -269,6 +279,7 @@ struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
 template<typename Derived1, typename Derived2, int Version>
 struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
 {
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
  {
    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
@@ -280,6 +291,7 @@ template<typename Derived1, typename Derived2, int Version>
 struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
 {
  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
  {
    const Index outerSize = dst.outerSize();
@@ -297,6 +309,7 @@ template<typename Derived1, typename Derived2, int Version>
 struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
 {
  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    const Index size = dst.size();
@@ -308,6 +321,7 @@ struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
 template<typename Derived1, typename Derived2, int Version>
 struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
 {
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
  {
    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
@@ -492,12 +506,25 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
  EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)

+#ifdef EIGEN_TEST_EVALUATORS
+  
+#ifdef EIGEN_DEBUG_ASSIGN
+  internal::copy_using_evaluator_traits<Derived, OtherDerived>::debug();
+#endif
+  eigen_assert(rows() == other.rows() && cols() == other.cols());
+  internal::call_dense_assignment_loop(derived(),other.derived());
+  
+#else // EIGEN_TEST_EVALUATORS
+
 #ifdef EIGEN_DEBUG_ASSIGN
  internal::assign_traits<Derived, OtherDerived>::debug();
 #endif
  eigen_assert(rows() == other.rows() && cols() == other.cols());
  internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
-                                                       : int(InvalidTraversal)>::run(derived(),other.derived());
+                                                             : int(InvalidTraversal)>::run(derived(),other.derived());
+  
+#endif // EIGEN_TEST_EVALUATORS
+  
 #ifndef EIGEN_NO_DEBUG
  checkTransposeAliasing(other.derived());
 #endif
@@ -517,22 +544,28 @@ struct assign_selector;

 template<typename Derived, typename OtherDerived>
 struct assign_selector<Derived,OtherDerived,false,false> {
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
  template<typename ActualDerived, typename ActualOtherDerived>
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
 };
 template<typename Derived, typename OtherDerived>
 struct assign_selector<Derived,OtherDerived,true,false> {
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
 };
 template<typename Derived, typename OtherDerived>
 struct assign_selector<Derived,OtherDerived,false,true> {
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
  template<typename ActualDerived, typename ActualOtherDerived>
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
 };
 template<typename Derived, typename OtherDerived>
 struct assign_selector<Derived,OtherDerived,true,true> {
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
 };

@@ -540,18 +573,21 @@ struct assign_selector<Derived,OtherDerived,true,true> {

 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
 }

 template<typename Derived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
 {
  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
 }

 template<typename Derived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
 {
  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
@@ -559,6 +595,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& ot

 template<typename Derived>
 template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
@@ -566,6 +603,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<Othe

 template<typename Derived>
 template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
 {
  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
@@ -573,6 +611,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<Othe

 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -0,0 +1,842 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGN_EVALUATOR_H
+#define EIGEN_ASSIGN_EVALUATOR_H
+
+namespace Eigen {
+
+// This implementation is based on Assign.h
+
+namespace internal {
+  
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for traversal and unrolling       *
+***************************************************************************/
+
+// copy_using_evaluator_traits is based on assign_traits
+
+template <typename Derived, typename OtherDerived>
+struct copy_using_evaluator_traits
+{
+public:
+  enum {
+    DstIsAligned = Derived::Flags & AlignedBit,
+    DstHasDirectAccess = Derived::Flags & DirectAccessBit,
+    SrcIsAligned = OtherDerived::Flags & AlignedBit,
+    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned,
+    SrcEvalBeforeAssign = (evaluator_traits<OtherDerived>::HasEvalTo == 1)
+  };
+
+private:
+  enum {
+    InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
+              : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
+              : int(Derived::RowsAtCompileTime),
+    InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
+              : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
+              : int(Derived::MaxRowsAtCompileTime),
+    MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
+    PacketSize = packet_traits<typename Derived::Scalar>::size
+  };
+
+  enum {
+    StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
+    MightVectorize = StorageOrdersAgree
+                  && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
+    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+                       && int(DstIsAligned) && int(SrcIsAligned),
+    MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
+    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
+                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
+      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+         so it's only good for large enough sizes. */
+    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
+      /* slice vectorization can be slow, so we only want it if the slices are big, which is
+         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
+         in a fixed-size matrix */
+  };
+
+public:
+  enum {
+    Traversal = int(SrcEvalBeforeAssign) ? int(AllAtOnceTraversal) 
+              : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
+              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
+              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
+              : int(MayLinearize)        ? int(LinearTraversal)
+                                         : int(DefaultTraversal),
+    Vectorized = int(Traversal) == InnerVectorizedTraversal
+              || int(Traversal) == LinearVectorizedTraversal
+              || int(Traversal) == SliceVectorizedTraversal
+  };
+
+private:
+  enum {
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
+    MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
+                       && int(OtherDerived::CoeffReadCost) != Dynamic
+                       && int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
+    MayUnrollInner      = int(InnerSize) != Dynamic
+                       && int(OtherDerived::CoeffReadCost) != Dynamic
+                       && int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
+  };
+
+public:
+  enum {
+    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
+                ? (
+                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
+                  : int(MayUnrollInner)      ? int(InnerUnrolling)
+                                             : int(NoUnrolling)
+                  )
+              : int(Traversal) == int(LinearVectorizedTraversal)
+                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) 
+                                                                    : int(NoUnrolling) )
+              : int(Traversal) == int(LinearTraversal)
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
+                                              : int(NoUnrolling) )
+              : int(NoUnrolling)
+  };
+
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    EIGEN_DEBUG_VAR(DstIsAligned)
+    EIGEN_DEBUG_VAR(SrcIsAligned)
+    EIGEN_DEBUG_VAR(JointAlignment)
+    EIGEN_DEBUG_VAR(InnerSize)
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(StorageOrdersAgree)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearize)
+    EIGEN_DEBUG_VAR(MayInnerVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    EIGEN_DEBUG_VAR(Traversal)
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(MayUnrollCompletely)
+    EIGEN_DEBUG_VAR(MayUnrollInner)
+    EIGEN_DEBUG_VAR(Unrolling)
+  }
+#endif
+};
+
+/***************************************************************************
+* Part 2 : meta-unrollers
+***************************************************************************/
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
+{
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+  
+  enum {
+    outer = Index / DstXprType::InnerSizeAtCompileTime,
+    inner = Index % DstXprType::InnerSizeAtCompileTime
+  };
+
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    kernel.assignCoeffByOuterInner(outer, inner);
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel, int outer)
+  {
+    kernel.assignCoeffByOuterInner(outer, Index);
+    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index+1, Stop>::run(kernel, outer);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel&, int) { }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Kernel& kernel)
+  {
+    kernel.assignCoeff(Index);
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling
+{
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+
+  enum {
+    outer = Index / DstXprType::InnerSizeAtCompileTime,
+    inner = Index % DstXprType::InnerSizeAtCompileTime,
+    JointAlignment = Kernel::AssignmentTraits::JointAlignment
+  };
+
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    kernel.template assignPacketByOuterInner<Aligned, JointAlignment>(outer, inner);
+    enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_innervec_InnerUnrolling
+{
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel, int outer)
+  {
+    kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, Index);
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel &, int) { }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+// dense_assignment_loop is based on assign_impl
+
+template<typename Kernel,
+         int Traversal = Kernel::AssignmentTraits::Traversal,
+         int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop;
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
+{
+  static void run(Kernel &kernel)
+  {
+    typedef typename Kernel::Index Index;
+    
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
+      for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
+        kernel.assignCoeffByOuterInner(outer, inner);
+      }
+    }
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
+{
+  typedef typename Kernel::Index Index;
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+
+    const Index outerSize = kernel.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+  }
+};
+
+/***************************
+*** Linear vectorization ***
+***************************/
+
+
+// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
+// of the non vectorizable beginning and ending parts
+
+template <bool IsAligned = false>
+struct unaligned_dense_assignment_loop
+{
+  // if IsAligned = true, then do nothing
+  template <typename Kernel>
+  static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index, typename Kernel::Index) {}
+};
+
+template <>
+struct unaligned_dense_assignment_loop<false>
+{
+  // MSVC must not inline this functions. If it does, it fails to optimize the
+  // packet access path.
+  // FIXME check which version exhibits this issue
+#ifdef _MSC_VER
+  template <typename Kernel>
+  static EIGEN_DONT_INLINE void run(Kernel &kernel,
+                                    typename Kernel::Index start,
+                                    typename Kernel::Index end)
+#else
+  template <typename Kernel>
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel,
+                                      typename Kernel::Index start,
+                                      typename Kernel::Index end)
+#endif
+  {
+    for (typename Kernel::Index index = start; index < end; ++index)
+      kernel.assignCoeff(index);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::Index Index;
+
+    const Index size = kernel.size();
+    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
+    enum {
+      packetSize = PacketTraits::size,
+      dstIsAligned = int(Kernel::AssignmentTraits::DstIsAligned),
+      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : dstIsAligned,
+      srcAlignment = Kernel::AssignmentTraits::JointAlignment
+    };
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
+    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
+
+    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
+
+    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
+      kernel.template assignPacket<dstAlignment, srcAlignment>(index);
+
+    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
+{
+  typedef typename Kernel::Index Index;
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    
+    enum { size = DstXprType::SizeAtCompileTime,
+           packetSize = packet_traits<typename Kernel::Scalar>::size,
+           alignedSize = (size/packetSize)*packetSize };
+
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
+  }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
+{
+  static inline void run(Kernel &kernel)
+  {
+    typedef typename Kernel::Index Index;
+
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const Index packetSize = packet_traits<typename Kernel::Scalar>::size;
+    for(Index outer = 0; outer < outerSize; ++outer)
+      for(Index inner = 0; inner < innerSize; inner+=packetSize)
+        kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, inner);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
+{
+  typedef typename Kernel::Index Index;
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    const Index outerSize = kernel.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+  }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
+{
+  static inline void run(Kernel &kernel)
+  {
+    typedef typename Kernel::Index Index;
+    const Index size = kernel.size();
+    for(Index i = 0; i < size; ++i)
+      kernel.assignCoeff(i);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
+{
+  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+/**************************
+*** Slice vectorization ***
+***************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
+{
+  static inline void run(Kernel &kernel)
+  {
+    typedef typename Kernel::Index Index;
+    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
+    enum {
+      packetSize = PacketTraits::size,
+      alignable = PacketTraits::AlignedOnScalar,
+      dstAlignment = alignable ? Aligned : int(Kernel::AssignmentTraits::DstIsAligned)
+    };
+    const Index packetAlignedMask = packetSize - 1;
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
+    Index alignedStart = ((!alignable) || Kernel::AssignmentTraits::DstIsAligned) ? 0
+                       : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0,0), innerSize);
+
+    for(Index outer = 0; outer < outerSize; ++outer)
+    {
+      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
+      // do the non-vectorizable part of the assignment
+      for(Index inner = 0; inner<alignedStart ; ++inner)
+        kernel.assignCoeffByOuterInner(outer, inner);
+
+      // do the vectorizable part of the assignment
+      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
+        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned>(outer, inner);
+
+      // do the non-vectorizable part of the assignment
+      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
+        kernel.assignCoeffByOuterInner(outer, inner);
+
+      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
+    }
+  }
+};
+
+/****************************
+*** All-at-once traversal ***
+****************************/
+
+// TODO: this 'AllAtOnceTraversal' should be dropped or caught earlier (Gael)
+// Indeed, what to do with the kernel's functor??
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, AllAtOnceTraversal, NoUnrolling>
+{
+  static inline void run(Kernel & kernel)
+  {
+    // Evaluate rhs in temporary to prevent aliasing problems in a = a * a;
+    // TODO: Do not pass the xpr object to evalTo() (Jitse)
+    kernel.srcEvaluator().evalTo(kernel.dstEvaluator(), kernel.dstExpression());
+  }
+};
+
+/***************************************************************************
+* Part 4 : Generic Assignment routine
+***************************************************************************/
+
+// This class generalize the assignment of a coefficient (or packet) from one dense evaluator
+// to another dense writable evaluator.
+// It is parametrized by the two evaluators, and the actual assignment functor.
+// This abstraction level permits to keep the evaluation loops as simple and as generic as possible.
+// One can customize the assignment using this generic dense_assignment_kernel with different
+// functors, or by completely overloading it, by-passing a functor.
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
+class generic_dense_assignment_kernel
+{
+protected:
+  typedef typename DstEvaluatorTypeT::XprType DstXprType;
+  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
+public:
+  
+  typedef DstEvaluatorTypeT DstEvaluatorType;
+  typedef SrcEvaluatorTypeT SrcEvaluatorType;
+  typedef typename DstEvaluatorType::Scalar Scalar;
+  typedef typename DstEvaluatorType::Index Index;
+  typedef copy_using_evaluator_traits<DstXprType, SrcXprType> AssignmentTraits;
+  
+  
+  generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
+  {}
+  
+  Index size() const        { return m_dstExpr.size(); }
+  Index innerSize() const   { return m_dstExpr.innerSize(); }
+  Index outerSize() const   { return m_dstExpr.outerSize(); }
+  Index outerStride() const { return m_dstExpr.outerStride(); }
+  
+  // TODO get rid of this one:
+  DstXprType& dstExpression() const { return m_dstExpr; }
+  
+  DstEvaluatorType& dstEvaluator() { return m_dst; }
+  const SrcEvaluatorType& srcEvaluator() const { return m_src; }
+  
+  void assignCoeff(Index row, Index col)
+  {
+    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
+  }
+  
+  void assignCoeff(Index index)
+  {
+    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
+  }
+  
+  void assignCoeffByOuterInner(Index outer, Index inner)
+  {
+    Index row = rowIndexByOuterInner(outer, inner); 
+    Index col = colIndexByOuterInner(outer, inner); 
+    assignCoeff(row, col);
+  }
+  
+  
+  template<int StoreMode, int LoadMode>
+  void assignPacket(Index row, Index col)
+  {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode>(row,col));
+  }
+  
+  template<int StoreMode, int LoadMode>
+  void assignPacket(Index index)
+  {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode>(index));
+  }
+  
+  template<int StoreMode, int LoadMode>
+  void assignPacketByOuterInner(Index outer, Index inner)
+  {
+    Index row = rowIndexByOuterInner(outer, inner); 
+    Index col = colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode>(row, col);
+  }
+  
+  static Index rowIndexByOuterInner(Index outer, Index inner)
+  {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::RowsAtCompileTime) == 1 ? 0
+      : int(Traits::ColsAtCompileTime) == 1 ? inner
+      : int(Traits::Flags)&RowMajorBit ? outer
+      : inner;
+  }
+
+  static Index colIndexByOuterInner(Index outer, Index inner)
+  {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::ColsAtCompileTime) == 1 ? 0
+      : int(Traits::RowsAtCompileTime) == 1 ? inner
+      : int(Traits::Flags)&RowMajorBit ? inner
+      : outer;
+  }
+  
+protected:
+  DstEvaluatorType& m_dst;
+  const SrcEvaluatorType& m_src;
+  const Functor &m_functor;
+  // TODO find a way to avoid the needs of the original expression
+  DstXprType& m_dstExpr;
+};
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+#ifdef EIGEN_DEBUG_ASSIGN
+  // TODO these traits should be computed from information provided by the evaluators
+  internal::copy_using_evaluator_traits<DstXprType, SrcXprType>::debug();
+#endif
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+  
+  typedef typename evaluator<DstXprType>::type DstEvaluatorType;
+  typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
+
+  DstEvaluatorType dstEvaluator(dst);
+  SrcEvaluatorType srcEvaluator(src);
+    
+  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+  
+  dense_assignment_loop<Kernel>::run(kernel);
+}
+
+template<typename DstXprType, typename SrcXprType>
+void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
+{
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+}
+
+/***************************************************************************
+* Part 5 : Entry points
+***************************************************************************/
+
+// Based on DenseBase::LazyAssign()
+// The following functions are just for testing and they are meant to be moved to operator= and the likes.
+
+template<typename DstXprType, template <typename> class StorageBase, typename SrcXprType>
+EIGEN_STRONG_INLINE
+const DstXprType& copy_using_evaluator(const NoAlias<DstXprType, StorageBase>& dst, 
+                                       const EigenBase<SrcXprType>& src)
+{
+  return noalias_copy_using_evaluator(dst.expression(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
+}
+
+template<typename XprType, int AssumeAliasing = evaluator_traits<XprType>::AssumeAliasing>
+struct AddEvalIfAssumingAliasing;
+
+template<typename XprType>
+struct AddEvalIfAssumingAliasing<XprType, 0>
+{
+  static const XprType& run(const XprType& xpr) 
+  {
+    return xpr;
+  }
+};
+
+template<typename XprType>
+struct AddEvalIfAssumingAliasing<XprType, 1>
+{
+  static const EvalToTemp<XprType> run(const XprType& xpr)
+  {
+    return EvalToTemp<XprType>(xpr);
+  }
+};
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_STRONG_INLINE
+const DstXprType& copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
+{
+  return noalias_copy_using_evaluator(dst.const_cast_derived(), 
+                                      AddEvalIfAssumingAliasing<SrcXprType>::run(src.derived()),
+                                      func
+                                     );
+}
+
+// this mimics operator=
+template<typename DstXprType, typename SrcXprType>
+EIGEN_STRONG_INLINE
+const DstXprType& copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src)
+{
+  return copy_using_evaluator(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
+}
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_STRONG_INLINE
+const DstXprType& noalias_copy_using_evaluator(const PlainObjectBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
+{
+#ifdef EIGEN_DEBUG_ASSIGN
+  internal::copy_using_evaluator_traits<DstXprType, SrcXprType>::debug();
+#endif
+#ifdef EIGEN_NO_AUTOMATIC_RESIZING
+  eigen_assert((dst.size()==0 || (IsVectorAtCompileTime ? (dst.size() == src.size())
+                                                        : (dst.rows() == src.rows() && dst.cols() == src.cols())))
+              && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+#else
+  dst.const_cast_derived().resizeLike(src.derived());
+#endif
+  call_dense_assignment_loop(dst.const_cast_derived(), src.derived(), func);
+  return dst.derived();
+}
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_STRONG_INLINE
+const DstXprType& noalias_copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
+{
+  call_dense_assignment_loop(dst.const_cast_derived(), src.derived(), func);
+  return dst.derived();
+}
+
+// Based on DenseBase::swap()
+// TODO: Check whether we need to do something special for swapping two
+//       Arrays or Matrices. (Jitse)
+
+// Overload default assignPacket behavior for swapping them
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
+class swap_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar> >
+{
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar> > Base;
+  typedef typename DstEvaluatorTypeT::PacketScalar PacketScalar;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+  
+public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::Index Index;
+  typedef typename Base::DstXprType DstXprType;
+  
+  swap_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, DstXprType& dstExpr)
+    : Base(dst, src, swap_assign_op<Scalar>(), dstExpr)
+  {}
+  
+  template<int StoreMode, int LoadMode>
+  void assignPacket(Index row, Index col)
+  {
+    m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
+  }
+  
+  template<int StoreMode, int LoadMode>
+  void assignPacket(Index index)
+  {
+    m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
+  }
+  
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
+  template<int StoreMode, int LoadMode>
+  void assignPacketByOuterInner(Index outer, Index inner)
+  {
+    Index row = Base::rowIndexByOuterInner(outer, inner); 
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode>(row, col);
+  }
+};
+  
+template<typename DstXprType, typename SrcXprType>
+void swap_using_evaluator(const DstXprType& dst, const SrcXprType& src)
+{
+  // TODO there is too much redundancy with call_dense_assignment_loop
+  
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+  
+  typedef typename evaluator<DstXprType>::type DstEvaluatorType;
+  typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
+
+  DstEvaluatorType dstEvaluator(dst);
+  SrcEvaluatorType srcEvaluator(src);
+    
+  typedef swap_kernel<DstEvaluatorType,SrcEvaluatorType> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, dst.const_cast_derived());
+  
+  dense_assignment_loop<Kernel>::run(kernel);
+}
+
+// Based on MatrixBase::operator+= (in CwiseBinaryOp.h)
+template<typename DstXprType, typename SrcXprType>
+void add_assign_using_evaluator(const MatrixBase<DstXprType>& dst, const MatrixBase<SrcXprType>& src)
+{
+  typedef typename DstXprType::Scalar Scalar;
+  copy_using_evaluator(dst.derived(), src.derived(), add_assign_op<Scalar>());
+}
+
+// Based on ArrayBase::operator+=
+template<typename DstXprType, typename SrcXprType>
+void add_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
+{
+  typedef typename DstXprType::Scalar Scalar;
+  copy_using_evaluator(dst.derived(), src.derived(), add_assign_op<Scalar>());
+}
+
+// TODO: Add add_assign_using_evaluator for EigenBase ? (Jitse)
+
+template<typename DstXprType, typename SrcXprType>
+void subtract_assign_using_evaluator(const MatrixBase<DstXprType>& dst, const MatrixBase<SrcXprType>& src)
+{
+  typedef typename DstXprType::Scalar Scalar;
+  copy_using_evaluator(dst.derived(), src.derived(), sub_assign_op<Scalar>());
+}
+
+template<typename DstXprType, typename SrcXprType>
+void subtract_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
+{
+  typedef typename DstXprType::Scalar Scalar;
+  copy_using_evaluator(dst.derived(), src.derived(), sub_assign_op<Scalar>());
+}
+
+template<typename DstXprType, typename SrcXprType>
+void multiply_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
+{
+  typedef typename DstXprType::Scalar Scalar;
+  copy_using_evaluator(dst.derived(), src.derived(), mul_assign_op<Scalar>());
+}
+
+template<typename DstXprType, typename SrcXprType>
+void divide_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
+{
+  typedef typename DstXprType::Scalar Scalar;
+  copy_using_evaluator(dst.derived(), src.derived(), div_assign_op<Scalar>());
+}
+
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ASSIGN_EVALUATOR_H
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -202,6 +202,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos,  Cos)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan,  Tan)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan,  Atan)
 //EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,  Abs)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp,  Exp)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log,  Ln)
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -21,6 +21,9 @@ namespace Eigen {
  * \param XprType the type of the expression in which we are taking a block
  * \param BlockRows the number of rows of the block we are taking at compile time (optional)
  * \param BlockCols the number of columns of the block we are taking at compile time (optional)
+  * \param InnerPanel is true, if the block maps to a set of rows of a row major matrix or
+  *        to set of columns of a column major matrix (optional). The parameter allows to determine
+  *        at compile time whether aligned access is possible on the block expression.
  *
  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
@@ -80,8 +83,8 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
                       && (InnerStrideAtCompileTime == 1)
                        ? PacketAccessBit : 0,
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
@@ -111,6 +114,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
  
    /** Column or Row constructor
      */
+    EIGEN_DEVICE_FUNC
    inline Block(XprType& xpr, Index i) : Impl(xpr,i)
    {
      eigen_assert( (i>=0) && (
@@ -120,6 +124,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class

    /** Fixed-size constructor
      */
+    EIGEN_DEVICE_FUNC
    inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
      : Impl(xpr, a_startRow, a_startCol)
    {
@@ -130,6 +135,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class

    /** Dynamic-size constructor
      */
+    EIGEN_DEVICE_FUNC
    inline Block(XprType& xpr,
          Index a_startRow, Index a_startCol,
          Index blockRows, Index blockCols)
@@ -153,8 +159,9 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
  public:
    typedef Impl Base;
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
+    EIGEN_DEVICE_FUNC
    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
 };
@@ -176,6 +183,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H

    /** Column or Row constructor
      */
+    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, Index i)
      : m_xpr(xpr),
        // It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
@@ -190,6 +198,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H

    /** Fixed-size constructor
      */
+    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
                    m_blockRows(BlockRows), m_blockCols(BlockCols)
@@ -197,6 +206,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H

    /** Dynamic-size constructor
      */
+    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr,
          Index a_startRow, Index a_startCol,
          Index blockRows, Index blockCols)
@@ -204,9 +214,10 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                    m_blockRows(blockRows), m_blockCols(blockCols)
    {}

-    inline Index rows() const { return m_blockRows.value(); }
-    inline Index cols() const { return m_blockCols.value(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index rowId, Index colId)
    {
      EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -214,17 +225,20 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return m_xpr.derived()
               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
    {
      return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index)
    {
      EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -233,6 +247,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return m_xpr.const_cast_derived()
@@ -240,6 +255,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
    }

+    EIGEN_DEVICE_FUNC
    inline const CoeffReturnType coeff(Index index) const
    {
      return m_xpr
@@ -279,21 +295,24 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H

    #ifdef EIGEN_PARSED_BY_DOXYGEN
    /** \sa MapBase::data() */
-    inline const Scalar* data() const;
-    inline Index innerStride() const;
-    inline Index outerStride() const;
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+    EIGEN_DEVICE_FUNC inline Index innerStride() const;
+    EIGEN_DEVICE_FUNC inline Index outerStride() const;
    #endif

-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
    { 
      return m_xpr; 
    }
      
-    Index startRow() const 
+    EIGEN_DEVICE_FUNC
+    Index startRow() const
    { 
      return m_startRow.value(); 
    }
      
+    EIGEN_DEVICE_FUNC
    Index startCol() const 
    { 
      return m_startCol.value(); 
@@ -322,6 +341,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    /** Column or Row constructor
      */
+    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, Index i)
      : Base(internal::const_cast_ptr(&xpr.coeffRef(
              (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
@@ -335,6 +355,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    /** Fixed-size constructor
      */
+    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
    {
@@ -343,6 +364,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    /** Dynamic-size constructor
      */
+    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr,
          Index startRow, Index startCol,
          Index blockRows, Index blockCols)
@@ -352,12 +374,14 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
      init();
    }

-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
    { 
      return m_xpr; 
    }
      
    /** \sa MapBase::innerStride() */
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
@@ -366,6 +390,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    }

    /** \sa MapBase::outerStride() */
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return m_outerStride;
@@ -379,6 +404,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal used by allowAligned() */
+    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
      : Base(data, blockRows, blockCols), m_xpr(xpr)
    {
@@ -387,6 +413,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
    #endif

  protected:
+    EIGEN_DEVICE_FUNC
    void init()
    {
      m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -29,9 +29,9 @@ struct all_unroller
 };

 template<typename Derived>
-struct all_unroller<Derived, 1>
+struct all_unroller<Derived, 0>
 {
-  static inline bool run(const Derived &mat) { return mat.coeff(0, 0); }
+  static inline bool run(const Derived &/*mat*/) { return true; }
 };

 template<typename Derived>
@@ -55,9 +55,9 @@ struct any_unroller
 };

 template<typename Derived>
-struct any_unroller<Derived, 1>
+struct any_unroller<Derived, 0>
 {
-  static inline bool run(const Derived &mat) { return mat.coeff(0, 0); }
+  static inline bool run(const Derived & /*mat*/) { return false; }
 };

 template<typename Derived>
--- a/Eigen/src/Core/CMakeLists.txt
+++ b/Eigen/src/Core/CMakeLists.txt
@@ -8,3 +8,4 @@ INSTALL(FILES
 ADD_SUBDIRECTORY(products)
 ADD_SUBDIRECTORY(util)
 ADD_SUBDIRECTORY(arch)
+ADD_SUBDIRECTORY(functors)
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -30,6 +30,7 @@ struct CommaInitializer
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::Index Index;

+  EIGEN_DEVICE_FUNC
  inline CommaInitializer(XprType& xpr, const Scalar& s)
    : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
  {
@@ -37,13 +38,27 @@ struct CommaInitializer
  }

  template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
  inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
    : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
  {
    m_xpr.block(0, 0, other.rows(), other.cols()) = other;
  }

+  /* Copy/Move constructor which transfers ownership. This is crucial in 
+   * absence of return value optimization to avoid assertions during destruction. */
+  // FIXME in C++11 mode this could be replaced by a proper RValue constructor
+  EIGEN_DEVICE_FUNC
+  inline CommaInitializer(const CommaInitializer& o)
+  : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
+    // Mark original object as finished. In absence of R-value references we need to const_cast:
+    const_cast<CommaInitializer&>(o).m_row = m_xpr.rows();
+    const_cast<CommaInitializer&>(o).m_col = m_xpr.cols();
+    const_cast<CommaInitializer&>(o).m_currentBlockRows = 0;
+  }
+
  /* inserts a scalar value in the target matrix */
+  EIGEN_DEVICE_FUNC
  CommaInitializer& operator,(const Scalar& s)
  {
    if (m_col==m_xpr.cols())
@@ -63,6 +78,7 @@ struct CommaInitializer

  /* inserts a matrix expression in the target matrix */
  template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
  CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
  {
    if(other.cols()==0 || other.rows()==0)
@@ -88,6 +104,7 @@ struct CommaInitializer
    return *this;
  }

+  EIGEN_DEVICE_FUNC
  inline ~CommaInitializer()
  {
    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
@@ -102,9 +119,10 @@ struct CommaInitializer
    * quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
    * \endcode
    */
+  EIGEN_DEVICE_FUNC
  inline XprType& finished() { return m_xpr; }

-  XprType& m_xpr;   // target expression
+  XprType& m_xpr;           // target expression
  Index m_row;              // current row id
  Index m_col;              // current col id
  Index m_currentBlockRows; // current block height
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -122,6 +122,7 @@ class CwiseBinaryOp : internal::no_assignment_operator,
    typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
    typedef typename internal::remove_reference<RhsNested>::type _RhsNested;

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
      : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
    {
@@ -131,6 +132,7 @@ class CwiseBinaryOp : internal::no_assignment_operator,
      eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rows() const {
      // return the fixed size type if available to enable compile time optimizations
      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
@@ -138,6 +140,7 @@ class CwiseBinaryOp : internal::no_assignment_operator,
      else
        return m_lhs.rows();
    }
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const {
      // return the fixed size type if available to enable compile time optimizations
      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
@@ -147,10 +150,13 @@ class CwiseBinaryOp : internal::no_assignment_operator,
    }

    /** \returns the left hand side nested expression */
+    EIGEN_DEVICE_FUNC
    const _LhsNested& lhs() const { return m_lhs; }
    /** \returns the right hand side nested expression */
+    EIGEN_DEVICE_FUNC
    const _RhsNested& rhs() const { return m_rhs; }
    /** \returns the functor representing the binary operation */
+    EIGEN_DEVICE_FUNC
    const BinaryOp& functor() const { return m_functor; }

  protected:
@@ -169,6 +175,7 @@ class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
    typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE( Derived )

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
    {
      return derived().functor()(derived().lhs().coeff(rowId, colId),
@@ -182,6 +189,7 @@ class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
                                          derived().rhs().template packet<LoadMode>(rowId, colId));
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
    {
      return derived().functor()(derived().lhs().coeff(index),
@@ -227,3 +235,4 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 } // end namespace Eigen

 #endif // EIGEN_CWISE_BINARY_OP_H
+
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -54,6 +54,7 @@ class CwiseNullaryOp : internal::no_assignment_operator,
    typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)

+    EIGEN_DEVICE_FUNC
    CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
      : m_rows(nbRows), m_cols(nbCols), m_functor(func)
    {
@@ -63,9 +64,12 @@ class CwiseNullaryOp : internal::no_assignment_operator,
            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
    {
      return m_functor(rowId, colId);
@@ -77,6 +81,7 @@ class CwiseNullaryOp : internal::no_assignment_operator,
      return m_functor.packetOp(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
    {
      return m_functor(index);
@@ -89,6 +94,7 @@ class CwiseNullaryOp : internal::no_assignment_operator,
    }

    /** \returns the functor representing the nullary operation */
+    EIGEN_DEVICE_FUNC
    const NullaryOp& functor() const { return m_functor; }

  protected:
@@ -132,6 +138,9 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
  *
  * The template parameter \a CustomNullaryOp is the type of the functor.
  *
+  * Here is an example with C++11 random generators: \include random_cpp11.cpp
+  * Output: \verbinclude random_cpp11.out
+  * 
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
@@ -740,6 +749,7 @@ namespace internal {
 template<typename Derived, bool Big = (Derived::SizeAtCompileTime>=16)>
 struct setIdentity_impl
 {
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
  {
    return m = Derived::Identity(m.rows(), m.cols());
@@ -750,6 +760,7 @@ template<typename Derived>
 struct setIdentity_impl<Derived, true>
 {
  typedef typename Derived::Index Index;
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
  {
    m.setZero();
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -64,20 +64,26 @@ class CwiseUnaryOp : internal::no_assignment_operator,
    typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)

+    EIGEN_DEVICE_FUNC
    inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
      : m_xpr(xpr), m_functor(func) {}

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }

    /** \returns the functor representing the unary operation */
+    EIGEN_DEVICE_FUNC
    const UnaryOp& functor() const { return m_functor; }

    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename XprType::Nested>::type&
    nestedExpression() const { return m_xpr; }

    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
    typename internal::remove_all<typename XprType::Nested>::type&
    nestedExpression() { return m_xpr.const_cast_derived(); }

@@ -98,6 +104,7 @@ class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
    typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
    {
      return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
@@ -109,12 +116,14 @@ class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
    {
      return derived().functor()(derived().nestedExpression().coeff(index));
    }

    template<int LoadMode>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
    {
      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -182,6 +182,7 @@ template<typename Derived> class DenseBase

    /** \returns the number of nonzero coefficients which is in practice the number
      * of stored coefficients. */
+    EIGEN_DEVICE_FUNC
    inline Index nonZeros() const { return size(); }
    /** \returns true if either the number of rows or the number of columns is equal to 1.
      * In other words, this function returns
@@ -193,6 +194,7 @@ template<typename Derived> class DenseBase
      * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
      * column-major matrix, and the number of rows for a row-major matrix. */
+    EIGEN_DEVICE_FUNC
    Index outerSize() const
    {
      return IsVectorAtCompileTime ? 1
@@ -204,6 +206,7 @@ template<typename Derived> class DenseBase
      * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
      * column-major matrix, and the number of columns for a row-major matrix. */
+    EIGEN_DEVICE_FUNC
    Index innerSize() const
    {
      return IsVectorAtCompileTime ? this->size()
@@ -214,6 +217,7 @@ template<typename Derived> class DenseBase
      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
      * nothing else.
      */
+    EIGEN_DEVICE_FUNC
    void resize(Index newSize)
    {
      EIGEN_ONLY_USED_FOR_DEBUG(newSize);
@@ -224,6 +228,7 @@ template<typename Derived> class DenseBase
      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
      * nothing else.
      */
+    EIGEN_DEVICE_FUNC
    void resize(Index nbRows, Index nbCols)
    {
      EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
@@ -247,42 +252,54 @@ template<typename Derived> class DenseBase

    /** Copies \a other into *this. \returns a reference to *this. */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const DenseBase<OtherDerived>& other);

    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const DenseBase& other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const EigenBase<OtherDerived> &other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator+=(const EigenBase<OtherDerived> &other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator-=(const EigenBase<OtherDerived> &other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& func);

 #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** Copies \a other into *this without evaluating other. \returns a reference to *this. */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& lazyAssign(const DenseBase<OtherDerived>& other);
 #endif // not EIGEN_PARSED_BY_DOXYGEN

+    EIGEN_DEVICE_FUNC
    CommaInitializer<Derived> operator<< (const Scalar& s);

    template<unsigned int Added,unsigned int Removed>
    const Flagged<Derived, Added, Removed> flagged() const;

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);

+    EIGEN_DEVICE_FUNC
    Eigen::Transpose<Derived> transpose();
-	typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    EIGEN_DEVICE_FUNC
    ConstTransposeReturnType transpose() const;
+    EIGEN_DEVICE_FUNC
    void transposeInPlace();
 #ifndef EIGEN_NO_DEBUG
  protected:
@@ -292,65 +309,68 @@ template<typename Derived> class DenseBase
 #endif


-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
    Constant(Index rows, Index cols, const Scalar& value);
-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
    Constant(Index size, const Scalar& value);
-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
    Constant(const Scalar& value);

-    static const SequentialLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
    LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
-    static const RandomAccessLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
    LinSpaced(Index size, const Scalar& low, const Scalar& high);
-    static const SequentialLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
-    static const RandomAccessLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
    LinSpaced(const Scalar& low, const Scalar& high);

-    template<typename CustomNullaryOp>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
    static const CwiseNullaryOp<CustomNullaryOp, Derived>
    NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
    static const CwiseNullaryOp<CustomNullaryOp, Derived>
    NullaryExpr(Index size, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
    static const CwiseNullaryOp<CustomNullaryOp, Derived>
    NullaryExpr(const CustomNullaryOp& func);

-    static const ConstantReturnType Zero(Index rows, Index cols);
-    static const ConstantReturnType Zero(Index size);
-    static const ConstantReturnType Zero();
-    static const ConstantReturnType Ones(Index rows, Index cols);
-    static const ConstantReturnType Ones(Index size);
-    static const ConstantReturnType Ones();
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index size);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero();
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones();

-    void fill(const Scalar& value);
-    Derived& setConstant(const Scalar& value);
-    Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
-    Derived& setLinSpaced(const Scalar& low, const Scalar& high);
-    Derived& setZero();
-    Derived& setOnes();
-    Derived& setRandom();
+    EIGEN_DEVICE_FUNC void fill(const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setZero();
+    EIGEN_DEVICE_FUNC Derived& setOnes();
+    EIGEN_DEVICE_FUNC Derived& setRandom();

-    template<typename OtherDerived>
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
    bool isApprox(const DenseBase<OtherDerived>& other,
                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC 
    bool isMuchSmallerThan(const RealScalar& other,
                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    template<typename OtherDerived>
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
    bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;

-    bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
    
    inline bool hasNaN() const;
    inline bool allFinite() const;

+    EIGEN_DEVICE_FUNC
    inline Derived& operator*=(const Scalar& other);
+    EIGEN_DEVICE_FUNC
    inline Derived& operator/=(const Scalar& other);

    typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
@@ -359,6 +379,7 @@ template<typename Derived> class DenseBase
      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
      * a const reference, in order to avoid a useless copy.
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE EvalReturnType eval() const
    {
      // Even though MSVC does not honor strong inlining when the return type
@@ -371,6 +392,7 @@ template<typename Derived> class DenseBase
      *
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void swap(const DenseBase<OtherDerived>& other,
              int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
    {
@@ -381,46 +403,52 @@ template<typename Derived> class DenseBase
      *
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void swap(PlainObjectBase<OtherDerived>& other)
    {
      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
    }


-    inline const NestByValue<Derived> nestByValue() const;
-    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
-    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+    EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
+    EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+    EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
+    template<bool Enable> EIGEN_DEVICE_FUNC
+    inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
+    template<bool Enable> EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();

-    Scalar sum() const;
-    Scalar mean() const;
-    Scalar trace() const;
+    EIGEN_DEVICE_FUNC Scalar sum() const;
+    EIGEN_DEVICE_FUNC Scalar mean() const;
+    EIGEN_DEVICE_FUNC Scalar trace() const;

-    Scalar prod() const;
+    EIGEN_DEVICE_FUNC Scalar prod() const;

-    typename internal::traits<Derived>::Scalar minCoeff() const;
-    typename internal::traits<Derived>::Scalar maxCoeff() const;
+    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
+    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;

-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;

    template<typename BinaryOp>
+    EIGEN_DEVICE_FUNC
    typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
    redux(const BinaryOp& func) const;

    template<typename Visitor>
+    EIGEN_DEVICE_FUNC
    void visit(Visitor& func) const;

    inline const WithFormat<Derived> format(const IOFormat& fmt) const;

    /** \returns the unique coefficient of a 1x1 expression */
+    EIGEN_DEVICE_FUNC
    CoeffReturnType value() const
    {
      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
@@ -428,8 +456,8 @@ template<typename Derived> class DenseBase
      return derived().coeff(0,0);
    }

-    bool all(void) const;
-    bool any(void) const;
+    bool all() const;
+    bool any() const;
    Index count() const;

    typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
@@ -478,27 +506,18 @@ template<typename Derived> class DenseBase
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS

-#ifdef EIGEN2_SUPPORT
-
-    Block<Derived> corner(CornerType type, Index cRows, Index cCols);
-    const Block<Derived> corner(CornerType type, Index cRows, Index cCols) const;
-    template<int CRows, int CCols>
-    Block<Derived, CRows, CCols> corner(CornerType type);
-    template<int CRows, int CCols>
-    const Block<Derived, CRows, CCols> corner(CornerType type) const;
-
-#endif // EIGEN2_SUPPORT
-

    // disable the use of evalTo for dense objects with a nice compilation error
-    template<typename Dest> inline void evalTo(Dest& ) const
+    template<typename Dest>
+    EIGEN_DEVICE_FUNC
+    inline void evalTo(Dest& ) const
    {
      EIGEN_STATIC_ASSERT((internal::is_same<Dest,void>::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
    }

  protected:
    /** Default constructor. Do nothing. */
-    DenseBase()
+    EIGEN_DEVICE_FUNC DenseBase()
    {
      /* Just checks for self-consistency of the flags.
       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
@@ -511,9 +530,9 @@ template<typename Derived> class DenseBase
    }

  private:
-    explicit DenseBase(int);
-    DenseBase(int,int);
-    template<typename OtherDerived> explicit DenseBase(const DenseBase<OtherDerived>&);
+    EIGEN_DEVICE_FUNC explicit DenseBase(int);
+    EIGEN_DEVICE_FUNC DenseBase(int,int);
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase<OtherDerived>&);
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -61,6 +61,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    using Base::size;
    using Base::derived;

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const
    {
      return int(Derived::RowsAtCompileTime) == 1 ? 0
@@ -69,6 +70,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
          : inner;
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const
    {
      return int(Derived::ColsAtCompileTime) == 1 ? 0
@@ -91,6 +93,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      *
      * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
    {
      eigen_internal_assert(row >= 0 && row < rows()
@@ -98,6 +101,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      return derived().coeff(row, col);
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
    {
      return coeff(rowIndexByOuterInner(outer, inner),
@@ -108,6 +112,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      *
      * \sa operator()(Index,Index), operator[](Index)
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const
    {
      eigen_assert(row >= 0 && row < rows()
@@ -130,6 +135,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
      */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    coeff(Index index) const
    {
@@ -146,13 +152,12 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      * z() const, w() const
      */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    operator[](Index index) const
    {
-      #ifndef EIGEN2_SUPPORT
      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
      eigen_assert(index >= 0 && index < size());
      return derived().coeff(index);
    }
@@ -167,6 +172,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
      * z() const, w() const
      */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    operator()(Index index) const
    {
@@ -176,21 +182,25 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>

    /** equivalent to operator[](0).  */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    x() const { return (*this)[0]; }

    /** equivalent to operator[](1).  */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    y() const { return (*this)[1]; }

    /** equivalent to operator[](2).  */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    z() const { return (*this)[2]; }

    /** equivalent to operator[](3).  */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE CoeffReturnType
    w() const { return (*this)[3]; }

@@ -311,6 +321,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      *
      * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
    {
      eigen_internal_assert(row >= 0 && row < rows()
@@ -318,6 +329,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      return derived().coeffRef(row, col);
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    coeffRefByOuterInner(Index outer, Index inner)
    {
@@ -330,6 +342,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      * \sa operator[](Index)
      */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    operator()(Index row, Index col)
    {
@@ -354,6 +367,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
      */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    coeffRef(Index index)
    {
@@ -368,13 +382,12 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
      */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    operator[](Index index)
    {
-      #ifndef EIGEN2_SUPPORT
      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
      eigen_assert(index >= 0 && index < size());
      return derived().coeffRef(index);
    }
@@ -388,6 +401,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
      */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    operator()(Index index)
    {
@@ -397,21 +411,25 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,

    /** equivalent to operator[](0).  */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    x() { return (*this)[0]; }

    /** equivalent to operator[](1).  */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    y() { return (*this)[1]; }

    /** equivalent to operator[](2).  */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    z() { return (*this)[2]; }

    /** equivalent to operator[](3).  */

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar&
    w() { return (*this)[3]; }

@@ -473,6 +491,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      */

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
    {
      eigen_internal_assert(row >= 0 && row < rows()
@@ -489,6 +508,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
      */

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
    {
      eigen_internal_assert(index >= 0 && index < size());
@@ -497,6 +517,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,


    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
    {
      const Index row = rowIndexByOuterInner(outer,inner);
@@ -581,6 +602,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa outerStride(), rowStride(), colStride()
      */
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return derived().innerStride();
@@ -591,6 +613,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), rowStride(), colStride()
      */
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return derived().outerStride();
@@ -606,6 +629,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), outerStride(), colStride()
      */
+    EIGEN_DEVICE_FUNC
    inline Index rowStride() const
    {
      return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -615,6 +639,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
      *
      * \sa innerStride(), outerStride(), rowStride()
      */
+    EIGEN_DEVICE_FUNC
    inline Index colStride() const
    {
      return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -652,6 +677,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa outerStride(), rowStride(), colStride()
      */
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return derived().innerStride();
@@ -662,6 +688,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), rowStride(), colStride()
      */
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return derived().outerStride();
@@ -677,6 +704,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), outerStride(), colStride()
      */
+    EIGEN_DEVICE_FUNC
    inline Index rowStride() const
    {
      return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -686,6 +714,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
      *
      * \sa innerStride(), outerStride(), rowStride()
      */
+    EIGEN_DEVICE_FUNC
    inline Index colStride() const
    {
      return Derived::IsRowMajor ? innerStride() : outerStride();
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -3,7 +3,7 @@
 //
 // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+// Copyright (C) 2010-2013 Hauke Heibel <hauke.heibel@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,26 +24,38 @@ namespace internal {

 struct constructor_without_unaligned_array_assert {};

+template<typename T, int Size>
+EIGEN_DEVICE_FUNC
+void check_static_allocation_size()
+{
+  // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
+  #if EIGEN_STACK_ALLOCATION_LIMIT
+  EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+  #endif
+}
+
 /** \internal
  * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned:
  * to 16 bytes boundary if the total size is a multiple of 16 bytes.
  */
 template <typename T, int Size, int MatrixOrArrayOptions,
          int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%16)==0) ? 16
+                        : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
                        : 0 >
 struct plain_array
 {
  T array[Size];

-  plain_array() 
+  EIGEN_DEVICE_FUNC
+  plain_array()
  { 
-    EIGEN_STATIC_ASSERT(Size * sizeof(T) <= 128 * 128 * 8, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+    check_static_allocation_size<T,Size>();
  }

-  plain_array(constructor_without_unaligned_array_assert) 
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
  { 
-    EIGEN_STATIC_ASSERT(Size * sizeof(T) <= 128 * 128 * 8, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+    check_static_allocation_size<T,Size>();
  }
 };

@@ -56,41 +68,43 @@ struct plain_array
  template<typename PtrType>
  EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
+    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
 #else
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
+    eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
 #endif

 template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
 {
-  EIGEN_USER_ALIGN16 T array[Size];
+  EIGEN_USER_ALIGN_DEFAULT T array[Size];

+  EIGEN_DEVICE_FUNC
  plain_array() 
  { 
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
-    EIGEN_STATIC_ASSERT(Size * sizeof(T) <= 128 * 128 * 8, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
+    check_static_allocation_size<T,Size>();
  }

+  EIGEN_DEVICE_FUNC
  plain_array(constructor_without_unaligned_array_assert) 
  { 
-    EIGEN_STATIC_ASSERT(Size * sizeof(T) <= 128 * 128 * 8, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+    check_static_allocation_size<T,Size>();
  }
 };

 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
-  EIGEN_USER_ALIGN16 T array[1];
-  plain_array() {}
-  plain_array(constructor_without_unaligned_array_assert) {}
+  EIGEN_USER_ALIGN_DEFAULT T array[1];
+  EIGEN_DEVICE_FUNC plain_array() {}
+  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };

 } // end namespace internal
@@ -114,33 +128,44 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
 {
    internal::plain_array<T,Size,_Options> m_data;
  public:
-    inline DenseStorage() {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()) {}
-    inline DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
-    static inline DenseIndex rows(void) {return _Rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void resize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC 
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
+    EIGEN_DEVICE_FUNC 
+    DenseStorage& operator=(const DenseStorage& other)
+    { 
+      if (this != &other) m_data = other.m_data;
+      return *this; 
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC void resize(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };

 // null matrix
 template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
 {
  public:
-    inline DenseStorage() {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) {}
-    inline DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void swap(DenseStorage& ) {}
-    static inline DenseIndex rows(void) {return _Rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void resize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline const T *data() const { return 0; }
-    inline T *data() { return 0; }
+    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
+    EIGEN_DEVICE_FUNC DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
+    EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC void resize(DenseIndex,DenseIndex,DenseIndex) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return 0; }
+    EIGEN_DEVICE_FUNC T *data() { return 0; }
 };

 // more specializations for null matrices; these are necessary to resolve ambiguities
@@ -160,18 +185,29 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
    DenseIndex m_rows;
    DenseIndex m_cols;
  public:
-    inline DenseStorage() : m_rows(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
-    inline void swap(DenseStorage& other)
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
+    DenseStorage& operator=(const DenseStorage& other) 
+    { 
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_rows = other.m_rows;
+        m_cols = other.m_cols;
+      }
+      return *this; 
+    }
+    DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
+    void swap(DenseStorage& other)
    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows() const {return m_rows;}
-    inline DenseIndex cols() const {return m_cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    inline void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC DenseIndex rows() const {return m_rows;}
+    EIGEN_DEVICE_FUNC DenseIndex cols() const {return m_cols;}
+    void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
+    void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };

 // dynamic-size matrix with fixed-size storage and fixed width
@@ -180,17 +216,27 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
    internal::plain_array<T,Size,_Options> m_data;
    DenseIndex m_rows;
  public:
-    inline DenseStorage() : m_rows(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    inline DenseIndex cols(void) const {return _Cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    inline void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
+    DenseStorage& operator=(const DenseStorage& other) 
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_rows = other.m_rows;
+      }
+      return *this; 
+    }
+    DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
+    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return _Cols;}
+    void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
+    void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };

 // dynamic-size matrix with fixed-size storage and fixed height
@@ -199,17 +245,27 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
    internal::plain_array<T,Size,_Options> m_data;
    DenseIndex m_cols;
  public:
-    inline DenseStorage() : m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows(void) const {return _Rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    inline void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
+    DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_cols = other.m_cols;
+      }
+      return *this;
+    }
+    DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
+    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return _Rows;}
+    EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
+    void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
+    void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };

 // purely dynamic matrix.
@@ -219,18 +275,51 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
    DenseIndex m_rows;
    DenseIndex m_cols;
  public:
-    inline DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(0), m_rows(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
-    inline void swap(DenseStorage& other)
+    DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols))
+      , m_rows(other.m_rows)
+      , m_cols(other.m_cols)
+    {
+      internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
+    }
+    DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    DenseStorage(DenseStorage&& other)
+      : m_data(std::move(other.m_data))
+      , m_rows(std::move(other.m_rows))
+      , m_cols(std::move(other.m_cols))
+    {
+      other.m_data = nullptr;
+    }
+    DenseStorage& operator=(DenseStorage&& other)
+    {
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_rows, other.m_rows);
+      swap(m_cols, other.m_cols);
+      return *this;
+    }
+#endif
+    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
+    void swap(DenseStorage& other)
    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
+    void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
      m_rows = nbRows;
@@ -250,8 +339,8 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      m_rows = nbRows;
      m_cols = nbCols;
    }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };

 // matrix with dynamic width and fixed height (so that matrix has dynamic size).
@@ -260,15 +349,45 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
    T *m_data;
    DenseIndex m_cols;
  public:
-    inline DenseStorage() : m_data(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
+    DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    static inline DenseIndex rows(void) {return _Rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
+      , m_cols(other.m_cols)
+    {
+      internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
+    }
+    DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }    
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    DenseStorage(DenseStorage&& other)
+      : m_data(std::move(other.m_data))
+      , m_cols(std::move(other.m_cols))
+    {
+      other.m_data = nullptr;
+    }
+    DenseStorage& operator=(DenseStorage&& other)
+    {
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_cols, other.m_cols);
+      return *this;
+    }
+#endif
+    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
+    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
+    void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
      m_cols = nbCols;
@@ -286,8 +405,8 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      }
      m_cols = nbCols;
    }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };

 // matrix with dynamic height and fixed width (so that matrix has dynamic size).
@@ -296,15 +415,45 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
    T *m_data;
    DenseIndex m_rows;
  public:
-    inline DenseStorage() : m_data(0), m_rows(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
+    DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
+    DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
+      , m_rows(other.m_rows)
+    {
+      internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
+    }
+    DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }    
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    DenseStorage(DenseStorage&& other)
+      : m_data(std::move(other.m_data))
+      , m_rows(std::move(other.m_rows))
+    {
+      other.m_data = nullptr;
+    }
+    DenseStorage& operator=(DenseStorage&& other)
+    {
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_rows, other.m_rows);
+      return *this;
+    }
+#endif
+    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
+    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
+    void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
      m_rows = nbRows;
@@ -322,8 +471,8 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      }
      m_rows = nbRows;
    }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -70,20 +70,30 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    typedef typename internal::dense_xpr_base<Diagonal>::type Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)

+    EIGEN_DEVICE_FUNC
    inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)

+    EIGEN_DEVICE_FUNC
    inline Index rows() const
-    { return m_index.value()<0 ? (std::min<Index>)(m_matrix.cols(),m_matrix.rows()+m_index.value()) : (std::min<Index>)(m_matrix.rows(),m_matrix.cols()-m_index.value()); }
+    {
+      EIGEN_USING_STD_MATH(min);
+      return m_index.value()<0 ? (min)(Index(m_matrix.cols()),Index(m_matrix.rows()+m_index.value()))
+                               : (min)(Index(m_matrix.rows()),Index(m_matrix.cols()-m_index.value()));
+      
+    }

+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return 1; }

+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return m_matrix.outerStride() + 1;
    }

+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return 0;
@@ -95,47 +105,57 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
                       const Scalar
                     >::type ScalarWithConstIfNotLvalue;

+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index row, Index)
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index row, Index) const
    {
      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
    }

+    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index row, Index) const
    {
      return m_matrix.coeff(row+rowOffset(), row+colOffset());
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index idx)
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index idx) const
    {
      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
    }

+    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index idx) const
    {
      return m_matrix.coeff(idx+rowOffset(), idx+colOffset());
    }

+    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename MatrixType::Nested>::type& 
    nestedExpression() const 
    {
      return m_matrix;
    }

+    EIGEN_DEVICE_FUNC
    int index() const
    {
      return m_index.value();
@@ -147,8 +167,11 @@ template<typename MatrixType, int _DiagIndex> class Diagonal

  private:
    // some compilers may fail to optimize std::max etc in case of compile-time constants...
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
    // triger a compile time error is someone try to call packet
    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -37,63 +37,64 @@ class DiagonalBase : public EigenBase<Derived>
    typedef DenseMatrixType DenseType;
    typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;

+    EIGEN_DEVICE_FUNC
    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC
    inline Derived& derived() { return *static_cast<Derived*>(this); }

+    EIGEN_DEVICE_FUNC
    DenseMatrixType toDenseMatrix() const { return derived(); }
    template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
    void evalTo(MatrixBase<DenseDerived> &other) const;
    template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
    void addTo(MatrixBase<DenseDerived> &other) const
    { other.diagonal() += diagonal(); }
    template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
    void subTo(MatrixBase<DenseDerived> &other) const
    { other.diagonal() -= diagonal(); }

+    EIGEN_DEVICE_FUNC
    inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
+    EIGEN_DEVICE_FUNC
    inline DiagonalVectorType& diagonal() { return derived().diagonal(); }

+    EIGEN_DEVICE_FUNC
    inline Index rows() const { return diagonal().size(); }
+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return diagonal().size(); }

    /** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
      */
    template<typename MatrixDerived>
+    EIGEN_DEVICE_FUNC
    const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
    operator*(const MatrixBase<MatrixDerived> &matrix) const
    {
      return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
    }

+    EIGEN_DEVICE_FUNC
    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
    inverse() const
    {
      return diagonal().cwiseInverse();
    }
    
+    EIGEN_DEVICE_FUNC
    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
    operator*(const Scalar& scalar) const
    {
      return diagonal() * scalar;
    }
+    EIGEN_DEVICE_FUNC
    friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
    operator*(const Scalar& scalar, const DiagonalBase& other)
    {
      return other.diagonal() * scalar;
    }
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    bool isApprox(const DiagonalBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return diagonal().isApprox(other.diagonal(), precision);
-    }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return toDenseMatrix().isApprox(other, precision);
-    }
-    #endif
 };

 template<typename Derived>
@@ -151,24 +152,31 @@ class DiagonalMatrix
  public:

    /** const version of diagonal(). */
+    EIGEN_DEVICE_FUNC
    inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
    /** \returns a reference to the stored vector of diagonal coefficients. */
+    EIGEN_DEVICE_FUNC
    inline DiagonalVectorType& diagonal() { return m_diagonal; }

    /** Default constructor without initialization */
+    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix() {}

    /** Constructs a diagonal matrix with given dimension  */
+    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}

    /** 2D constructor. */
+    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x,y) {}

    /** 3D constructor. */
+    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}

    /** Copy constructor. */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}

    #ifndef EIGEN_PARSED_BY_DOXYGEN
@@ -178,11 +186,13 @@ class DiagonalMatrix

    /** generic constructor from expression of the diagonal coefficients */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other)
    {}

    /** Copy operator. */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other)
    {
      m_diagonal = other.diagonal();
@@ -193,6 +203,7 @@ class DiagonalMatrix
    /** This is a special case of the templated operator=. Its purpose is to
      * prevent a default operator= from hiding the templated operator=.
      */
+    EIGEN_DEVICE_FUNC
    DiagonalMatrix& operator=(const DiagonalMatrix& other)
    {
      m_diagonal = other.diagonal();
@@ -201,14 +212,19 @@ class DiagonalMatrix
    #endif

    /** Resizes to given size. */
+    EIGEN_DEVICE_FUNC
    inline void resize(Index size) { m_diagonal.resize(size); }
    /** Sets all coefficients to zero. */
+    EIGEN_DEVICE_FUNC
    inline void setZero() { m_diagonal.setZero(); }
    /** Resizes and sets all coefficients to zero. */
+    EIGEN_DEVICE_FUNC
    inline void setZero(Index size) { m_diagonal.setZero(size); }
    /** Sets this matrix to be the identity matrix of the current size. */
+    EIGEN_DEVICE_FUNC
    inline void setIdentity() { m_diagonal.setOnes(); }
    /** Sets this matrix to be the identity matrix of the given size. */
+    EIGEN_DEVICE_FUNC
    inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
 };

@@ -255,9 +271,11 @@ class DiagonalWrapper
    #endif

    /** Constructor from expression of diagonal coefficients to wrap. */
+    EIGEN_DEVICE_FUNC
    inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}

    /** \returns a const reference to the wrapped expression of diagonal coefficients. */
+    EIGEN_DEVICE_FUNC
    const DiagonalVectorType& diagonal() const { return m_diagonal; }

  protected:
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -29,6 +29,7 @@ template<typename T, typename U,
 struct dot_nocheck
 {
  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  EIGEN_DEVICE_FUNC
  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
@@ -39,6 +40,7 @@ template<typename T, typename U>
 struct dot_nocheck<T, U, true>
 {
  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  EIGEN_DEVICE_FUNC
  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
@@ -59,6 +61,7 @@ struct dot_nocheck<T, U, true>
  */
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
@@ -73,34 +76,6 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
  return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
 }

-#ifdef EIGEN2_SUPPORT
-/** \returns the dot product of *this with other, with the Eigen2 convention that the dot product is linear in the first variable
-  * (conjugating the second variable). Of course this only makes a difference in the complex case.
-  *
-  * This method is only available in EIGEN2_SUPPORT mode.
-  *
-  * \only_for_vectors
-  *
-  * \sa dot()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::traits<Derived>::Scalar
-MatrixBase<Derived>::eigen2_dot(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-  eigen_assert(size() == other.size());
-
-  return internal::dot_nocheck<OtherDerived,Derived>::run(other,*this);
-}
-#endif
-
-
 //---------- implementation of L2 norm and related functions ----------

 /** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
@@ -164,6 +139,7 @@ template<typename Derived, int p>
 struct lpNorm_selector
 {
  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const MatrixBase<Derived>& m)
  {
    using std::pow;
@@ -174,6 +150,7 @@ struct lpNorm_selector
 template<typename Derived>
 struct lpNorm_selector<Derived, 1>
 {
+  EIGEN_DEVICE_FUNC
  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
  {
    return m.cwiseAbs().sum();
@@ -183,6 +160,7 @@ struct lpNorm_selector<Derived, 1>
 template<typename Derived>
 struct lpNorm_selector<Derived, 2>
 {
+  EIGEN_DEVICE_FUNC
  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
  {
    return m.norm();
@@ -192,6 +170,7 @@ struct lpNorm_selector<Derived, 2>
 template<typename Derived>
 struct lpNorm_selector<Derived, Infinity>
 {
+  EIGEN_DEVICE_FUNC
  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
  {
    return m.cwiseAbs().maxCoeff();
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -31,29 +31,40 @@ template<typename Derived> struct EigenBase
  typedef typename internal::traits<Derived>::Index Index;

  /** \returns a reference to the derived object */
+  EIGEN_DEVICE_FUNC
  Derived& derived() { return *static_cast<Derived*>(this); }
  /** \returns a const reference to the derived object */
+  EIGEN_DEVICE_FUNC
  const Derived& derived() const { return *static_cast<const Derived*>(this); }

+  EIGEN_DEVICE_FUNC
  inline Derived& const_cast_derived() const
  { return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
+  EIGEN_DEVICE_FUNC
  inline const Derived& const_derived() const
  { return *static_cast<const Derived*>(this); }

  /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
+  EIGEN_DEVICE_FUNC
  inline Index rows() const { return derived().rows(); }
  /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
+  EIGEN_DEVICE_FUNC
  inline Index cols() const { return derived().cols(); }
  /** \returns the number of coefficients, which is rows()*cols().
    * \sa rows(), cols(), SizeAtCompileTime. */
+  EIGEN_DEVICE_FUNC
  inline Index size() const { return rows() * cols(); }

  /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
-  template<typename Dest> inline void evalTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void evalTo(Dest& dst) const
  { derived().evalTo(dst); }

  /** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
-  template<typename Dest> inline void addTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void addTo(Dest& dst) const
  {
    // This is the default implementation,
    // derived class can reimplement it in a more optimized way.
@@ -63,7 +74,9 @@ template<typename Derived> struct EigenBase
  }

  /** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
-  template<typename Dest> inline void subTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void subTo(Dest& dst) const
  {
    // This is the default implementation,
    // derived class can reimplement it in a more optimized way.
@@ -73,7 +86,8 @@ template<typename Derived> struct EigenBase
  }

  /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
-  template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const
  {
    // This is the default implementation,
    // derived class can reimplement it in a more optimized way.
@@ -81,7 +95,8 @@ template<typename Derived> struct EigenBase
  }

  /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
-  template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const
  {
    // This is the default implementation,
    // derived class can reimplement it in a more optimized way.
@@ -126,36 +141,6 @@ Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
  return derived();
 }

-/** replaces \c *this by \c *this * \a other.
-  *
-  * \returns a reference to \c *this
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline Derived&
-MatrixBase<Derived>::operator*=(const EigenBase<OtherDerived> &other)
-{
-  other.derived().applyThisOnTheRight(derived());
-  return derived();
-}
-
-/** replaces \c *this by \c *this * \a other. It is equivalent to MatrixBase::operator*=().
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline void MatrixBase<Derived>::applyOnTheRight(const EigenBase<OtherDerived> &other)
-{
-  other.derived().applyThisOnTheRight(derived());
-}
-
-/** replaces \c *this by \c *this * \a other. */
-template<typename Derived>
-template<typename OtherDerived>
-inline void MatrixBase<Derived>::applyOnTheLeft(const EigenBase<OtherDerived> &other)
-{
-  other.derived().applyThisOnTheLeft(derived());
-}
-
 } // end namespace Eigen

 #endif // EIGEN_EIGENBASE_H
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -1,985 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FUNCTORS_H
-#define EIGEN_FUNCTORS_H
-
-namespace Eigen {
-
-namespace internal {
-
-// associative functors:
-
-/** \internal
-  * \brief Template functor to compute the sum of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, MatrixBase::sum()
-  */
-template<typename Scalar> struct scalar_sum_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::padd(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sum_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the product of two scalars
-  *
-  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmul(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
-  { return internal::predux_mul(a); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
-    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the conjugate product of two scalars
-  *
-  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
-
-  enum {
-    Conj = NumTraits<LhsScalar>::IsComplex
-  };
-  
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
-  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
-  
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = NumTraits<LhsScalar>::MulCost,
-    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the min of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
-  */
-template<typename Scalar> struct scalar_min_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::min; return (min)(a, b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmin(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_min(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_min_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMin
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the max of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
-  */
-template<typename Scalar> struct scalar_max_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::max; return (max)(a, b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmax(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_max(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_max_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMax
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the hypot of two scalars
-  *
-  * \sa MatrixBase::stableNorm(), class Redux
-  */
-template<typename Scalar> struct scalar_hypot_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-//   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
-  {
-    using std::max;
-    using std::min;
-    using std::sqrt;
-    Scalar p = (max)(_x, _y);
-    Scalar q = (min)(_x, _y);
-    Scalar qp = q/p;
-    return p * sqrt(Scalar(1) + qp*qp);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_hypot_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
-};
-
-/** \internal
-  * \brief Template functor to compute the pow of two scalars
-  */
-template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
-  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
-};
-template<typename Scalar, typename OtherScalar>
-struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
-
-// other binary functors:
-
-/** \internal
-  * \brief Template functor to compute the difference of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::operator-
-  */
-template<typename Scalar> struct scalar_difference_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::psub(a,b); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_difference_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasSub
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the quotient of two scalars
-  *
-  * \sa class CwiseBinaryOp, Cwise::operator/()
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pdiv(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
-  };
-};
-
-
-
-/** \internal
-  * \brief Template functor to compute the and of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator&&
-  */
-struct scalar_boolean_and_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
-  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
-};
-template<> struct functor_traits<scalar_boolean_and_op> {
-  enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the or of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator||
-  */
-struct scalar_boolean_or_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
-  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
-};
-template<> struct functor_traits<scalar_boolean_or_op> {
-  enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
-  };
-};
-
-// unary functors:
-
-/** \internal
-  * \brief Template functor to compute the opposite of a scalar
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator-
-  */
-template<typename Scalar> struct scalar_opposite_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pnegate(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_opposite_op<Scalar> >
-{ enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasNegate };
-};
-
-/** \internal
-  * \brief Template functor to compute the absolute value of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::abs
-  */
-template<typename Scalar> struct scalar_abs_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pabs(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_abs_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAbs
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the squared absolute value of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::abs2
-  */
-template<typename Scalar> struct scalar_abs2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_abs2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
-
-/** \internal
-  * \brief Template functor to compute the conjugate of a complex value
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
-  */
-template<typename Scalar> struct scalar_conjugate_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_conjugate_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
-    PacketAccess = packet_traits<Scalar>::HasConj
-  };
-};
-
-/** \internal
-  * \brief Template functor to cast a scalar to another type
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::cast()
-  */
-template<typename Scalar, typename NewType>
-struct scalar_cast_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef NewType result_type;
-  EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
-};
-template<typename Scalar, typename NewType>
-struct functor_traits<scalar_cast_op<Scalar,NewType> >
-{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the real part of a complex
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::real()
-  */
-template<typename Scalar>
-struct scalar_real_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_real_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the imaginary part of a complex
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::imag()
-  */
-template<typename Scalar>
-struct scalar_imag_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_imag_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the real part of a complex as a reference
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::real()
-  */
-template<typename Scalar>
-struct scalar_real_ref_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_real_ref_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the imaginary part of a complex as a reference
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::imag()
-  */
-template<typename Scalar>
-struct scalar_imag_ref_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_imag_ref_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  *
-  * \brief Template functor to compute the exponential of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::exp()
-  */
-template<typename Scalar> struct scalar_exp_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_exp_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
-
-/** \internal
-  *
-  * \brief Template functor to compute the logarithm of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::log()
-  */
-template<typename Scalar> struct scalar_log_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_log_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
-
-/** \internal
-  * \brief Template functor to multiply a scalar by a fixed other one
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
-  */
-/* NOTE why doing the pset1() in packetOp *is* an optimization ?
- * indeed it seems better to declare m_other as a Packet and do the pset1() once
- * in the constructor. However, in practice:
- *  - GCC does not like m_other as a Packet and generate a load every time it needs it
- *  - on the other hand GCC is able to moves the pset1() outside the loop :)
- *  - simpler code ;)
- * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
- */
-template<typename Scalar>
-struct scalar_multiple_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_multiple_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-template<typename Scalar1, typename Scalar2>
-struct scalar_multiple2_op {
-  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
-  EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
-  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
-};
-template<typename Scalar1,typename Scalar2>
-struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
-{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to divide a scalar by a fixed other one
-  *
-  * This functor is used to implement the quotient of a matrix by
-  * a scalar where the scalar type is not necessarily a floating point type.
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator/
-  */
-template<typename Scalar>
-struct scalar_quotient1_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_quotient1_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
-
-// nullary functors
-
-template<typename Scalar>
-struct scalar_constant_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_constant_op<Scalar> >
-// FIXME replace this packet test by a safe one
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
-
-template<typename Scalar> struct scalar_identity_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_identity_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
-
-template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
-
-// linear access for packet ops:
-// 1) initialization
-//   base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
-// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
-//   base += [size*step, ..., size*step]
-//
-// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
-//       in order to avoid the padd() in operator() ?
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,false>
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
-  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
-  { 
-    m_base = padd(m_base, pset1<Packet>(m_step));
-    return m_low+Scalar(i)*m_step; 
-  }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
-
-  const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_packetStep;
-  mutable Packet m_base;
-};
-
-// random access for packet ops:
-// 1) each step
-//   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,true>
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
-  { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(i),m_interPacket))); }
-
-  const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_lowPacket;
-  const Packet m_stepPacket;
-  const Packet m_interPacket;
-};
-
-// ----- Linspace functor ----------------------------------------------------------------
-
-// Forward declaration (we default to random access which does not really give
-// us a speed gain when using packet access but it allows to use the functor in
-// nested expressions).
-template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
-template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
-template <typename Scalar, bool RandomAccess> struct linspaced_op
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-  linspaced_op(const Scalar& low, const Scalar& high, DenseIndex num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/(num_steps-1))) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
-  {
-    eigen_assert(col==0 || row==0);
-    return impl(col + row);
-  }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
-  {
-    eigen_assert(col==0 || row==0);
-    return impl.packetOp(col + row);
-  }
-
-  // This proxy object handles the actual required temporaries, the different
-  // implementations (random vs. sequential access) as well as the
-  // correct piping to size 2/4 packet operations.
-  const linspaced_op_impl<Scalar,RandomAccess> impl;
-};
-
-// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
-// to indicate whether a functor allows linear access, just always answering 'yes' except for
-// scalar_identity_op.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
-template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
-
-// In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
-// where the mixing of different types is handled by scalar_product_traits
-// In particular, real * complex<real> is allowed.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-
-
-/** \internal
-  * \brief Template functor to add a scalar to a fixed other one
-  * \sa class CwiseUnaryOp, Array::operator+
-  */
-/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
-template<typename Scalar>
-struct scalar_add_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
-  inline scalar_add_op(const Scalar& other) : m_other(other) { }
-  inline Scalar operator() (const Scalar& a) const { return a + m_other; }
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::padd(a, pset1<Packet>(m_other)); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_add_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to compute the square root of a scalar
-  * \sa class CwiseUnaryOp, Cwise::sqrt()
-  */
-template<typename Scalar> struct scalar_sqrt_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sqrt_op<Scalar> >
-{ enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasSqrt
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the cosine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::cos()
-  */
-template<typename Scalar> struct scalar_cos_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
-  inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_cos_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasCos
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the sine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::sin()
-  */
-template<typename Scalar> struct scalar_sin_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sin_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasSin
-  };
-};
-
-
-/** \internal
-  * \brief Template functor to compute the tan of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::tan()
-  */
-template<typename Scalar> struct scalar_tan_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_tan_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasTan
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the arc cosine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::acos()
-  */
-template<typename Scalar> struct scalar_acos_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_acos_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasACos
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the arc sine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::asin()
-  */
-template<typename Scalar> struct scalar_asin_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_asin_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasASin
-  };
-};
-
-/** \internal
-  * \brief Template functor to raise a scalar to a power
-  * \sa class CwiseUnaryOp, Cwise::pow
-  */
-template<typename Scalar>
-struct scalar_pow_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
-  inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
-  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
-  const Scalar m_exponent;
-};
-template<typename Scalar>
-struct functor_traits<scalar_pow_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to compute the quotient between a scalar and array entries.
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_mult_op {
-  scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
-  inline Scalar operator() (const Scalar& a) const { return m_other / a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(m_other),a); }
-  Scalar m_other;
-};
-
-/** \internal
-  * \brief Template functor to compute the inverse of a scalar
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
-  inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
-
-/** \internal
-  * \brief Template functor to compute the square of a scalar
-  * \sa class CwiseUnaryOp, Cwise::square()
-  */
-template<typename Scalar>
-struct scalar_square_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
-  inline Scalar operator() (const Scalar& a) const { return a*a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_square_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-/** \internal
-  * \brief Template functor to compute the cube of a scalar
-  * \sa class CwiseUnaryOp, Cwise::cube()
-  */
-template<typename Scalar>
-struct scalar_cube_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
-  inline Scalar operator() (const Scalar& a) const { return a*a*a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,pmul(a,a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_cube_op<Scalar> >
-{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-// default functor traits for STL functors:
-
-template<typename T>
-struct functor_traits<std::multiplies<T> >
-{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::divides<T> >
-{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::plus<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::minus<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::negate<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_or<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_and<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_not<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::greater<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::less<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::greater_equal<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::less_equal<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::equal_to<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::not_equal_to<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binder2nd<T> >
-{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binder1st<T> >
-{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::unary_negate<T> >
-{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binary_negate<T> >
-{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
-
-#ifdef EIGEN_STDEXT_SUPPORT
-
-template<typename T0,typename T1>
-struct functor_traits<std::project1st<T0,T1> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::project2nd<T0,T1> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::select2nd<std::pair<T0,T1> > >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::select1st<std::pair<T0,T1> > >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::unary_compose<T0,T1> >
-{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
-
-template<typename T0,typename T1,typename T2>
-struct functor_traits<std::binary_compose<T0,T1,T2> >
-{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
-
-#endif // EIGEN_STDEXT_SUPPORT
-
-// allow to add new functors and specializations of functor_traits from outside Eigen.
-// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
-#ifdef EIGEN_FUNCTORS_PLUGIN
-#include EIGEN_FUNCTORS_PLUGIN
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_FUNCTORS_H
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -19,9 +19,10 @@ namespace internal
 template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isApprox_selector
 {
+  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
  {
-    using std::min;
+    EIGEN_USING_STD_MATH(min);
    typename internal::nested<Derived,2>::type nested(x);
    typename internal::nested<OtherDerived,2>::type otherNested(y);
    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * (min)(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
@@ -31,6 +32,7 @@ struct isApprox_selector
 template<typename Derived, typename OtherDerived>
 struct isApprox_selector<Derived, OtherDerived, true>
 {
+  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&)
  {
    return x.matrix() == y.matrix();
@@ -40,6 +42,7 @@ struct isApprox_selector<Derived, OtherDerived, true>
 template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isMuchSmallerThan_object_selector
 {
+  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
  {
    return x.cwiseAbs2().sum() <= numext::abs2(prec) * y.cwiseAbs2().sum();
@@ -49,6 +52,7 @@ struct isMuchSmallerThan_object_selector
 template<typename Derived, typename OtherDerived>
 struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
 {
+  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&)
  {
    return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
@@ -58,6 +62,7 @@ struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
 template<typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isMuchSmallerThan_scalar_selector
 {
+  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const typename Derived::RealScalar& y, const typename Derived::RealScalar& prec)
  {
    return x.cwiseAbs2().sum() <= numext::abs2(prec * y);
@@ -67,6 +72,7 @@ struct isMuchSmallerThan_scalar_selector
 template<typename Derived>
 struct isMuchSmallerThan_scalar_selector<Derived, true>
 {
+  EIGEN_DEVICE_FUNC
  static bool run(const Derived& x, const typename Derived::RealScalar&, const typename Derived::RealScalar&)
  {
    return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -66,8 +66,7 @@ template<typename Lhs, typename Rhs> struct product_type
    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
                                           _Rhs::MaxRowsAtCompileTime),
    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
-                                        _Rhs::RowsAtCompileTime),
-    LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                                        _Rhs::RowsAtCompileTime)
  };

  // the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -232,7 +231,7 @@ EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest&
  // FIXME not very good if rhs is real and lhs complex while alpha is real too
  const Index cols = dest.cols();
  for (Index j=0; j<cols; ++j)
-    func(dest.col(j), prod.rhs().coeff(j) * prod.lhs());
+    func(dest.col(j), prod.rhs().coeff(0,j) * prod.lhs());
 }

 // Row major
@@ -243,7 +242,7 @@ EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest&
  // FIXME not very good if lhs is real and rhs complex while alpha is real too
  const Index rows = dest.rows();
  for (Index i=0; i<rows; ++i)
-    func(dest.row(i), prod.lhs().coeff(i) * prod.rhs());
+    func(dest.row(i), prod.lhs().coeff(i,0) * prod.rhs());
 }

 template<typename Lhs, typename Rhs>
@@ -397,7 +396,7 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
  EIGEN_STRONG_INLINE Scalar* data() {
    return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
            : m_data.array;
  }
  #endif
@@ -446,7 +445,7 @@ template<> struct gemv_selector<OnTheRight,ColMajor,true>
    if(!evalToDest)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
+      Index size = dest.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
      if(!alphaIsCompatible)
@@ -511,7 +510,7 @@ template<> struct gemv_selector<OnTheRight,RowMajor,true>
    if(!DirectlyUseRhs)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
@@ -565,6 +564,40 @@ template<> struct gemv_selector<OnTheRight,RowMajor,false>
  *
  * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
  */
+#ifndef __CUDACC__
+
+#ifdef EIGEN_TEST_EVALUATORS
+template<typename Derived>
+template<typename OtherDerived>
+inline const Product<Derived, OtherDerived>
+MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
+{
+  // A note regarding the function declaration: In MSVC, this function will sometimes
+  // not be inlined since DenseStorage is an unwindable object for dynamic
+  // matrices and product types are holding a member to store the result.
+  // Thus it does not help tagging this function with EIGEN_STRONG_INLINE.
+  enum {
+    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
+                   || OtherDerived::RowsAtCompileTime==Dynamic
+                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
+    AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
+    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
+  };
+  // note to the lost user:
+  //    * for a dot product use: v1.dot(v2)
+  //    * for a coeff-wise product use: v1.cwiseProduct(v2)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+    INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+  EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+#ifdef EIGEN_DEBUG_PRODUCT
+  internal::product_type<Derived,OtherDerived>::debug();
+#endif
+  
+  return Product<Derived, OtherDerived>(derived(), other.derived());
+}
+#else
 template<typename Derived>
 template<typename OtherDerived>
 inline const typename ProductReturnType<Derived, OtherDerived>::Type
@@ -594,7 +627,9 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 #endif
  return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
 }
+#endif

+#endif
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
  *
  * The returned product will behave like any other expressions: the coefficients of the product will be
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -42,6 +42,8 @@ namespace internal {
 struct default_packet_traits
 {
  enum {
+    HasHalfPacket = 0,
+    
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
@@ -71,10 +73,12 @@ struct default_packet_traits
 template<typename T> struct packet_traits : default_packet_traits
 {
  typedef T type;
+  typedef T half;
  enum {
    Vectorizable = 0,
    size = 1,
-    AlignedOnScalar = 0
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
  };
  enum {
    HasAdd    = 0,
@@ -91,94 +95,149 @@ template<typename T> struct packet_traits : default_packet_traits
 };

 /** \internal \returns a + b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 padd(const Packet& a,
        const Packet& b) { return a+b; }

 /** \internal \returns a - b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 psub(const Packet& a,
        const Packet& b) { return a-b; }

 /** \internal \returns -a (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pnegate(const Packet& a) { return -a; }

 /** \internal \returns conj(a) (coeff-wise) */
-template<typename Packet> inline Packet
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pconj(const Packet& a) { return numext::conj(a); }

 /** \internal \returns a * b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmul(const Packet& a,
        const Packet& b) { return a*b; }

 /** \internal \returns a / b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pdiv(const Packet& a,
        const Packet& b) { return a/b; }

 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmin(const Packet& a,
-        const Packet& b) { using std::min; return (min)(a, b); }
+        const Packet& b) { EIGEN_USING_STD_MATH(min); return (min)(a, b); }

 /** \internal \returns the max of \a a and \a b  (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmax(const Packet& a,
-        const Packet& b) { using std::max; return (max)(a, b); }
+        const Packet& b) { EIGEN_USING_STD_MATH(max); return (max)(a, b); }

 /** \internal \returns the absolute value of \a a */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pabs(const Packet& a) { using std::abs; return abs(a); }

 /** \internal \returns the bitwise and of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pand(const Packet& a, const Packet& b) { return a & b; }

 /** \internal \returns the bitwise or of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 por(const Packet& a, const Packet& b) { return a | b; }

 /** \internal \returns the bitwise xor of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pxor(const Packet& a, const Packet& b) { return a ^ b; }

 /** \internal \returns the bitwise andnot of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pandnot(const Packet& a, const Packet& b) { return a & (!b); }

 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload(const typename unpacket_traits<Packet>::type* from) { return *from; }

 /** \internal \returns a packet version of \a *from, (un-aligned load) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }

+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
+
 /** \internal \returns a packet with elements of \a *from duplicated.
-  * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
-  * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
+  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
+  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
  * Currently, this function is only used for scalar * complex products.
- */
-template<typename Packet> inline Packet
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }

-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
-template<typename Packet> inline Packet
-pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+/** \internal \returns a packet with elements of \a *from quadrupled.
+  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
+  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
+  * Currently, this function is only used in matrix products.
+  * For packet-size smaller or equal to 4, this function is equivalent to pload1 
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadquad(const typename unpacket_traits<Packet>::type* from)
+{ return pload1<Packet>(from); }
+
+/** \internal equivalent to
+  * \code
+  * a0 = pload1(a+0);
+  * a1 = pload1(a+1);
+  * a2 = pload1(a+2);
+  * a3 = pload1(a+3);
+  * \endcode
+  * \sa pset1, pload1, ploaddup, pbroadcast2
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+  a2 = pload1<Packet>(a+2);
+  a3 = pload1<Packet>(a+3);
+}
+
+/** \internal equivalent to
+  * \code
+  * a0 = pload1(a+0);
+  * a1 = pload1(a+1);
+  * \endcode
+  * \sa pset1, pload1, ploaddup, pbroadcast4
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+}

 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
 template<typename Scalar> inline typename packet_traits<Scalar>::type
 plset(const Scalar& a) { return a; }

 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet> inline void pstore(Scalar* to, const Packet& from)
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
 { (*to) = from; }

 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
-template<typename Scalar, typename Packet> inline void pstoreu(Scalar* to, const Packet& from)
-{ (*to) = from; }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
+{  (*to) = from; }
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, DenseIndex /*stride*/)
+ { return ploadu<Packet>(from); }
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, DenseIndex /*stride*/)
+ { pstore(to, from); }

 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> inline void prefetch(const Scalar* addr)
@@ -189,36 +248,45 @@ __builtin_prefetch(addr);
 }

 /** \internal \returns the first element of a packet */
-template<typename Packet> inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
 { return a; }

 /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 preduxp(const Packet* vecs) { return vecs[0]; }

 /** \internal \returns the sum of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
+{ return a; }
+
+/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller or equal to 4, this boils down to a noop.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux4(const Packet& a)
 { return a; }

 /** \internal \returns the product of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }

 /** \internal \returns the min of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }

 /** \internal \returns the max of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }

 /** \internal \returns the reversed elements of \a a*/
-template<typename Packet> inline Packet preverse(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }


 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
-template<typename Packet> inline Packet pcplxflip(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
  // FIXME: uncomment the following in case we drop the internal imag and real functions.
 //   using std::imag;
@@ -250,6 +318,10 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }

+/** \internal \returns the atan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { using std::atan; return atan(a); }
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -275,7 +347,7 @@ inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename u
 }

 /** \internal \returns a * b + c (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmadd(const Packet&  a,
         const Packet&  b,
         const Packet&  c)
@@ -336,15 +408,33 @@ inline void palign(PacketType& first, const PacketType& second)
 * Fast complex products (GCC generates a function call which is very slow)
 ***************************************************************************/

+// Eigen+CUDA does not support complexes.
+#ifndef __CUDACC__
+
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

 template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
 { return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

+#endif
+
+
+/***************************************************************************
+ * PacketBlock, that is a collection of N packets where the number of words
+ * in the packet is a multiple of N.
+***************************************************************************/
+template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
+  Packet packet[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
 } // end namespace internal

 } // end namespace Eigen

 #endif // EIGEN_GENERIC_PACKET_MATH_H
-
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -45,6 +45,7 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -49,7 +49,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
  */
 struct IOFormat
 {
-  /** Default contructor, see class IOFormat for the meaning of the parameters */
+  /** Default constructor, see class IOFormat for the meaning of the parameters */
  IOFormat(int _precision = StreamPrecision, int _flags = 0,
    const std::string& _coeffSeparator = " ",
    const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
@@ -57,6 +57,10 @@ struct IOFormat
  : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
  {
+    // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
+    // don't add rowSpacer if columns are not to be aligned
+    if((flags & DontAlignCols))
+      return;
    int i = int(matSuffix.length())-1;
    while (i>=0 && matSuffix[i]!='\n')
    {
@@ -185,21 +189,22 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
    explicit_precision = fmt.precision;
  }

+  std::streamsize old_precision = 0;
+  if(explicit_precision) old_precision = s.precision(explicit_precision);
+
  bool align_cols = !(fmt.flags & DontAlignCols);
  if(align_cols)
  {
    // compute the largest width
-    for(Index j = 1; j < m.cols(); ++j)
+    for(Index j = 0; j < m.cols(); ++j)
      for(Index i = 0; i < m.rows(); ++i)
      {
        std::stringstream sstr;
-        if(explicit_precision) sstr.precision(explicit_precision);
+        sstr.copyfmt(s);
        sstr << m.coeff(i,j);
        width = std::max<Index>(width, Index(sstr.str().length()));
      }
  }
-  std::streamsize old_precision = 0;
-  if(explicit_precision) old_precision = s.precision(explicit_precision);
  s << fmt.matPrefix;
  for(Index i = 0; i < m.rows(); ++i)
  {
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -88,7 +88,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
                        && ( bool(IsDynamicSize)
                           || HasNoOuterStride
                           || ( OuterStrideAtCompileTime!=Dynamic
-                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
    Flags0 = TraitsBase::Flags & (~NestByRefBit),
    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
@@ -110,19 +110,17 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    EIGEN_DENSE_PUBLIC_INTERFACE(Map)

    typedef typename Base::PointerType PointerType;
-#if EIGEN2_SUPPORT_STAGE <= STAGE30_FULL_EIGEN3_API
-    typedef const Scalar* PointerArgType;
-    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return const_cast<PointerType>(ptr); }
-#else
    typedef PointerType PointerArgType;
+    EIGEN_DEVICE_FUNC
    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
-#endif

+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const
    {
      return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
    }

+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const
    {
      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
@@ -136,6 +134,7 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
      * \param dataPtr pointer to the array to map
      * \param a_stride optional Stride object, passing the strides.
      */
+    EIGEN_DEVICE_FUNC
    inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
      : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
    {
@@ -148,6 +147,7 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
      * \param a_size the size of the vector expression
      * \param a_stride optional Stride object, passing the strides.
      */
+    EIGEN_DEVICE_FUNC
    inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
      : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
    {
@@ -161,6 +161,7 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
      * \param nbCols the number of columns of the matrix expression
      * \param a_stride optional Stride object, passing the strides.
      */
+    EIGEN_DEVICE_FUNC
    inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
      : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
    {
@@ -173,19 +174,6 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    StrideType m_stride;
 };

-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-inline Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
-  ::Array(const Scalar *data)
-{
-  this->_set_noalias(Eigen::Map<const Array>(data));
-}
-
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-inline Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
-  ::Matrix(const Scalar *data)
-{
-  this->_set_noalias(Eigen::Map<const Matrix>(data));
-}

 } // end namespace Eigen

--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -76,8 +76,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>

    typedef typename Base::CoeffReturnType CoeffReturnType;

-    inline Index rows() const { return m_rows.value(); }
-    inline Index cols() const { return m_cols.value(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }

    /** Returns a pointer to the first coefficient of the matrix or vector.
      *
@@ -87,22 +87,26 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      */
    inline const Scalar* data() const { return m_data; }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeff(Index rowId, Index colId) const
    {
      return m_data[colId * colStride() + rowId * rowStride()];
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeff(Index index) const
    {
      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
      return m_data[index * innerStride()];
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return this->m_data[colId * colStride() + rowId * rowStride()];
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -123,12 +127,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
    }

+    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
    {
      EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
      checkSanity();
    }

+    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr, Index vecSize)
            : m_data(dataPtr),
              m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
@@ -140,6 +146,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      checkSanity();
    }

+    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
            : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
    {
@@ -151,12 +158,13 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>

  protected:

+    EIGEN_DEVICE_FUNC
    void checkSanity() const
    {
      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
                                        internal::inner_stride_at_compile_time<Derived>::ret==1),
                          PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
+      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0)
                   && "data is not aligned");
    }

@@ -195,14 +203,18 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
                    const Scalar
                  >::type ScalarWithConstIfNotLvalue;

+    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return this->m_data; }
+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } // no const-cast here so non-const-correct code will give a compile error

+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col)
    {
      return this->m_data[col * colStride() + row * rowStride()];
    }

+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
    {
      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -224,10 +236,11 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
                (this->m_data + index * innerStride(), val);
    }

-    explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
-    inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
+    EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}

+    EIGEN_DEVICE_FUNC
    Derived& operator=(const MapBase& other)
    {
      Base::Base::operator=(other);
@@ -237,6 +250,8 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
    using Base::Base::operator=;
 };

+#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
+
 } // end namespace Eigen

 #endif // EIGEN_MAPBASE_H
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -12,6 +12,15 @@

 namespace Eigen {

+// On WINCE, std::abs is defined for int only, so let's defined our own overloads:
+// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
+#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500
+long        abs(long        x) { return (labs(x));  }
+double      abs(double      x) { return (fabs(x));  }
+float       abs(float       x) { return (fabsf(x)); }
+long double abs(long double x) { return (fabsl(x)); }
+#endif
+  
 namespace internal {

 /** \internal \struct global_math_functions_filtering_base
@@ -62,6 +71,7 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct real_default_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    return x;
@@ -72,6 +82,7 @@ template<typename Scalar>
 struct real_default_impl<Scalar,true>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    using std::real;
@@ -87,7 +98,6 @@ struct real_retval
  typedef typename NumTraits<Scalar>::Real type;
 };

-
 /****************************************************************************
 * Implementation of imag                                                 *
 ****************************************************************************/
@@ -96,6 +106,7 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct imag_default_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar&)
  {
    return RealScalar(0);
@@ -106,6 +117,7 @@ template<typename Scalar>
 struct imag_default_impl<Scalar,true>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    using std::imag;
@@ -129,10 +141,12 @@ template<typename Scalar>
 struct real_ref_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar& run(Scalar& x)
  {
    return reinterpret_cast<RealScalar*>(&x)[0];
  }
+  EIGEN_DEVICE_FUNC
  static inline const RealScalar& run(const Scalar& x)
  {
    return reinterpret_cast<const RealScalar*>(&x)[0];
@@ -153,10 +167,12 @@ template<typename Scalar, bool IsComplex>
 struct imag_ref_default_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar& run(Scalar& x)
  {
    return reinterpret_cast<RealScalar*>(&x)[1];
  }
+  EIGEN_DEVICE_FUNC
  static inline const RealScalar& run(const Scalar& x)
  {
    return reinterpret_cast<RealScalar*>(&x)[1];
@@ -166,10 +182,12 @@ struct imag_ref_default_impl
 template<typename Scalar>
 struct imag_ref_default_impl<Scalar, false>
 {
+  EIGEN_DEVICE_FUNC
  static inline Scalar run(Scalar&)
  {
    return Scalar(0);
  }
+  EIGEN_DEVICE_FUNC
  static inline const Scalar run(const Scalar&)
  {
    return Scalar(0);
@@ -192,6 +210,7 @@ struct imag_ref_retval
 template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct conj_impl
 {
+  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    return x;
@@ -201,6 +220,7 @@ struct conj_impl
 template<typename Scalar>
 struct conj_impl<Scalar,true>
 {
+  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    using std::conj;
@@ -222,6 +242,7 @@ template<typename Scalar>
 struct abs2_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    return x*x;
@@ -231,6 +252,7 @@ struct abs2_impl
 template<typename RealScalar>
 struct abs2_impl<std::complex<RealScalar> >
 {
+  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const std::complex<RealScalar>& x)
  {
    return real(x)*real(x) + imag(x)*imag(x);
@@ -251,6 +273,7 @@ template<typename Scalar, bool IsComplex>
 struct norm1_default_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
    using std::abs;
@@ -261,6 +284,7 @@ struct norm1_default_impl
 template<typename Scalar>
 struct norm1_default_impl<Scalar, false>
 {
+  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
    using std::abs;
@@ -287,16 +311,23 @@ struct hypot_impl
  typedef typename NumTraits<Scalar>::Real RealScalar;
  static inline RealScalar run(const Scalar& x, const Scalar& y)
  {
-    using std::max;
-    using std::min;
+    EIGEN_USING_STD_MATH(max);
+    EIGEN_USING_STD_MATH(min);
    using std::abs;
    using std::sqrt;
    RealScalar _x = abs(x);
    RealScalar _y = abs(y);
-    RealScalar p = (max)(_x, _y);
-    if(p==RealScalar(0)) return 0;
-    RealScalar q = (min)(_x, _y);
-    RealScalar qp = q/p;
+    Scalar p, qp;
+    if(_x>_y)
+    {
+      p = _x;
+      qp = _y / p;
+    }
+    else
+    {
+      p = _y;
+      qp = _x / p;
+    }
    return p * sqrt(RealScalar(1) + qp*qp);
  }
 };
@@ -332,37 +363,45 @@ inline NewType cast(const OldType& x)
 * Implementation of atanh2                                                *
 ****************************************************************************/

-template<typename Scalar, bool IsInteger>
-struct atanh2_default_impl
+template<typename Scalar>
+struct atanh2_impl
 {
-  typedef Scalar retval;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline Scalar run(const Scalar& x, const Scalar& y)
+  static inline Scalar run(const Scalar& x, const Scalar& r)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    #if (__cplusplus >= 201103L) && !defined(__CYGWIN__)
+      using std::log1p;
+      return log1p(2 * x / (r - x)) / 2;
+    #else
+      using std::abs;
+      using std::log;
+      using std::sqrt;
+      Scalar z = x / r;
+      if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon()))
+        return log((r + x) / (r - x)) / 2;
+      else
+        return z + z*z*z / 3;
+    #endif
+  }
+};
+
+template<typename RealScalar>
+struct atanh2_impl<std::complex<RealScalar> >
+{
+  typedef std::complex<RealScalar> Scalar;
+  static inline Scalar run(const Scalar& x, const Scalar& r)
  {
-    using std::abs;
    using std::log;
+    using std::norm;
    using std::sqrt;
-    Scalar z = x / y;
-    if (y == Scalar(0) || abs(z) > sqrt(NumTraits<RealScalar>::epsilon()))
-      return RealScalar(0.5) * log((y + x) / (y - x));
+    Scalar z = x / r;
+    if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon())
+      return RealScalar(0.5) * log((r + x) / (r - x));
    else
      return z + z*z*z / RealScalar(3);
  }
 };

-template<typename Scalar>
-struct atanh2_default_impl<Scalar, true>
-{
-  static inline Scalar run(const Scalar&, const Scalar&)
-  {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    return Scalar(0);
-  }
-};
-
-template<typename Scalar>
-struct atanh2_impl : atanh2_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
-
 template<typename Scalar>
 struct atanh2_retval
 {
@@ -554,72 +593,84 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
 namespace numext {

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
 }  

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x)
 {
  return internal::real_ref_impl<Scalar>::run(x);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
 {
  return internal::imag_ref_impl<Scalar>::run(x);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
 {
  return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y)
 {
  return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
 {
  return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
 }

 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
 {
  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
@@ -627,11 +678,22 @@ inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)

 // std::isfinite is non standard, so let's define our own version,
 // even though it is not very efficient.
-template<typename T> bool (isfinite)(const T& x)
+template<typename T>
+EIGEN_DEVICE_FUNC
+bool (isfinite)(const T& x)
 {
  return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
 }

+template<typename T>
+EIGEN_DEVICE_FUNC
+bool (isfinite)(const std::complex<T>& x)
+{
+  using std::real;
+  using std::imag;
+  return isfinite(real(x)) && isfinite(imag(x));
+}
+
 } // end namespace numext

 namespace internal {
@@ -649,18 +711,20 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, false, false>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
  {
    using std::abs;
    return abs(x) <= abs(y) * prec;
  }
+  EIGEN_DEVICE_FUNC
  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
  {
-    using std::min;
+    EIGEN_USING_STD_MATH(min);
    using std::abs;
    return abs(x - y) <= (min)(abs(x), abs(y)) * prec;
  }
+  EIGEN_DEVICE_FUNC
  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
  {
    return x <= y || isApprox(x, y, prec);
@@ -671,15 +735,17 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, false, true>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
  static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
  {
    return x == Scalar(0);
  }
+  EIGEN_DEVICE_FUNC
  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
  {
    return x == y;
  }
+  EIGEN_DEVICE_FUNC
  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
  {
    return x <= y;
@@ -697,7 +763,7 @@ struct scalar_fuzzy_default_impl<Scalar, true, false>
  }
  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
  {
-    using std::min;
+    EIGEN_USING_STD_MATH(min);
    return numext::abs2(x - y) <= (min)(numext::abs2(x), numext::abs2(y)) * prec * prec;
  }
 };
@@ -705,21 +771,21 @@ struct scalar_fuzzy_default_impl<Scalar, true, false>
 template<typename Scalar>
 struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};

-template<typename Scalar, typename OtherScalar>
+template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
 inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
 {
  return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
 }

-template<typename Scalar>
+template<typename Scalar> EIGEN_DEVICE_FUNC
 inline bool isApprox(const Scalar& x, const Scalar& y,
                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
 {
  return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
 }

-template<typename Scalar>
+template<typename Scalar> EIGEN_DEVICE_FUNC
 inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
 {
@@ -742,17 +808,19 @@ template<> struct scalar_fuzzy_impl<bool>
 {
  typedef bool RealScalar;
  
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
  static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
  {
    return !x;
  }
  
+  EIGEN_DEVICE_FUNC
  static inline bool isApprox(bool x, bool y, bool)
  {
    return x == y;
  }

+  EIGEN_DEVICE_FUNC
  static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
  {
    return (!x) || y;
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -151,6 +151,7 @@ class Matrix
      *
      * \callgraph
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
    {
      return Base::_set(other);
@@ -167,6 +168,7 @@ class Matrix
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
    {
      return Base::_set(other);
@@ -179,12 +181,14 @@ class Matrix
      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived> &other)
    {
      return Base::operator=(other);
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func)
    {
      return Base::operator=(func);
@@ -200,6 +204,7 @@ class Matrix
      *
      * \sa resize(Index,Index)
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix() : Base()
    {
      Base::_check_template_params();
@@ -207,45 +212,85 @@ class Matrix
    }

    // FIXME is it still needed
+    EIGEN_DEVICE_FUNC
    Matrix(internal::constructor_without_unaligned_array_assert)
      : Base(internal::constructor_without_unaligned_array_assert())
    { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }

-    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE explicit Matrix(Index dim)
-      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    Matrix(Matrix&& other)
+      : Base(std::move(other))
    {
      Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Matrix)
-      eigen_assert(dim >= 0);
-      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
+        Base::_set_noalias(other);
    }
+    Matrix& operator=(Matrix&& other)
+    {
+      other.swap(*this);
+      return *this;
+    }
+#endif

    #ifndef EIGEN_PARSED_BY_DOXYGEN
+
+    // This constructor is for both 1x1 matrices and dynamic vectors
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Matrix(const T& x)
+    {
+      Base::_check_template_params();
+      Base::template _init1<T>(x);
+    }
+
    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
    {
      Base::_check_template_params();
      Base::template _init2<T0,T1>(x, y);
    }
    #else
+    /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
+    EIGEN_DEVICE_FUNC
+    explicit Matrix(const Scalar *data);
+
+    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * This is useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead.
+      * 
+      * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
+      * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
+      * For fixed-size \c 1x1 matrices it is thefore recommended to use the default
+      * constructor Matrix() instead, especilly when using one of the non standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
+    EIGEN_STRONG_INLINE explicit Matrix(Index dim);
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+    Matrix(const Scalar& x);
    /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
      *
      * This is useful for dynamic-size matrices. For fixed-size matrices,
      * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
+      * Matrix() instead.
+      * 
+      * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
+      * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
+      * For fixed-size \c 1x2 or \c 2x1 vectors it is thefore recommended to use the default
+      * constructor Matrix() instead, especilly when using one of the non standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
+    EIGEN_DEVICE_FUNC
    Matrix(Index rows, Index cols);
+    
    /** \brief Constructs an initialized 2D vector with given coefficients */
    Matrix(const Scalar& x, const Scalar& y);
    #endif

    /** \brief Constructs an initialized 3D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
    {
      Base::_check_template_params();
@@ -255,6 +300,7 @@ class Matrix
      m_storage.data()[2] = z;
    }
    /** \brief Constructs an initialized 4D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
    {
      Base::_check_template_params();
@@ -265,10 +311,10 @@ class Matrix
      m_storage.data()[3] = w;
    }

-    explicit Matrix(const Scalar *data);

    /** \brief Constructor copying the value of the expression \a other */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
             : Base(other.rows() * other.cols(), other.rows(), other.cols())
    {
@@ -281,6 +327,7 @@ class Matrix
      Base::_set_noalias(other);
    }
    /** \brief Copy constructor */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const Matrix& other)
            : Base(other.rows() * other.cols(), other.rows(), other.cols())
    {
@@ -289,6 +336,7 @@ class Matrix
    }
    /** \brief Copy constructor with in-place evaluation */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
    {
      Base::_check_template_params();
@@ -300,11 +348,12 @@ class Matrix
      * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
    {
      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
+      Base::_resize_to_match(other);
      // FIXME/CHECK: isn't *this = other.derived() more efficient. it allows to
      //              go for pure _set() implementations, right?
      *this = other;
@@ -315,26 +364,22 @@ class Matrix
      * of same type it is enough to swap the data pointers.
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void swap(MatrixBase<OtherDerived> const & other)
    { this->_swap(other.derived()); }

-    inline Index innerStride() const { return 1; }
-    inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }

    /////////// Geometry module ///////////

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    explicit Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);

-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    explicit Matrix(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    template<typename OtherDerived>
-    Matrix& operator=(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    #endif
-
    // allow to extend Matrix outside Eigen
    #ifdef EIGEN_MATRIX_PLUGIN
    #include EIGEN_MATRIX_PLUGIN
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -81,6 +81,7 @@ template<typename Derived> class MatrixBase
    using Base::operator-=;
    using Base::operator*=;
    using Base::operator/=;
+    using Base::operator*;

    typedef typename Base::CoeffReturnType CoeffReturnType;
    typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@@ -98,6 +99,7 @@ template<typename Derived> class MatrixBase

    /** \returns the size of the main diagonal, which is min(rows(),cols()).
      * \sa rows(), cols(), SizeAtCompileTime. */
+    EIGEN_DEVICE_FUNC
    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }

    /** \brief The plain matrix type corresponding to this expression.
@@ -145,38 +147,59 @@ template<typename Derived> class MatrixBase
    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
      */
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const MatrixBase& other);

    // We cannot inherit here via Base::operator= since it is causing
    // trouble with MSVC.

    template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const DenseBase<OtherDerived>& other);

    template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const EigenBase<OtherDerived>& other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& other);

 #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_DEVICE_FUNC
    Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
-
-    template<typename MatrixPower, typename Lhs, typename Rhs>
-    Derived& lazyAssign(const MatrixPowerProduct<MatrixPower, Lhs,Rhs>& other);
 #endif // not EIGEN_PARSED_BY_DOXYGEN

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator+=(const MatrixBase<OtherDerived>& other);
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    Derived& operator-=(const MatrixBase<OtherDerived>& other);

+#ifdef __CUDACC__
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    const typename LazyProductReturnType<Derived,OtherDerived>::Type
+    operator*(const MatrixBase<OtherDerived> &other) const
+    { return this->lazyProduct(other); }
+#else
+
+#ifdef EIGEN_TEST_EVALUATORS
+    template<typename OtherDerived>
+    const Product<Derived,OtherDerived>
+    operator*(const MatrixBase<OtherDerived> &other) const;
+#else
    template<typename OtherDerived>
    const typename ProductReturnType<Derived,OtherDerived>::Type
    operator*(const MatrixBase<OtherDerived> &other) const;
+#endif
+
+#endif

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    const typename LazyProductReturnType<Derived,OtherDerived>::Type
    lazyProduct(const MatrixBase<OtherDerived> &other) const;

@@ -190,39 +213,44 @@ template<typename Derived> class MatrixBase
    void applyOnTheRight(const EigenBase<OtherDerived>& other);

    template<typename DiagonalDerived>
+    EIGEN_DEVICE_FUNC
    const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
    operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
    dot(const MatrixBase<OtherDerived>& other) const;

-    #ifdef EIGEN2_SUPPORT
-      template<typename OtherDerived>
-      Scalar eigen2_dot(const MatrixBase<OtherDerived>& other) const;
-    #endif
-
-    RealScalar squaredNorm() const;
-    RealScalar norm() const;
+    EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
+    EIGEN_DEVICE_FUNC RealScalar norm() const;
    RealScalar stableNorm() const;
    RealScalar blueNorm() const;
    RealScalar hypotNorm() const;
-    const PlainObject normalized() const;
-    void normalize();
+    EIGEN_DEVICE_FUNC const PlainObject normalized() const;
+    EIGEN_DEVICE_FUNC void normalize();

-    const AdjointReturnType adjoint() const;
-    void adjointInPlace();
+    EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
+    EIGEN_DEVICE_FUNC void adjointInPlace();

    typedef Diagonal<Derived> DiagonalReturnType;
+    EIGEN_DEVICE_FUNC
    DiagonalReturnType diagonal();
-	typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    
+    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    EIGEN_DEVICE_FUNC
    ConstDiagonalReturnType diagonal() const;

    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };

-    template<int Index> typename DiagonalIndexReturnType<Index>::Type diagonal();
-    template<int Index> typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
+    template<int Index> 
+    EIGEN_DEVICE_FUNC
+    typename DiagonalIndexReturnType<Index>::Type diagonal();
+
+    template<int Index>
+    EIGEN_DEVICE_FUNC
+    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;

    // Note: The "MatrixBase::" prefixes are added to help MSVC9 to match these declarations with the later implementations.
    // On the other hand they confuse MSVC8...
@@ -230,48 +258,51 @@ template<typename Derived> class MatrixBase
    typename MatrixBase::template DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
    typename MatrixBase::template ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
    #else
+    EIGEN_DEVICE_FUNC
    typename DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
+    
+    EIGEN_DEVICE_FUNC
    typename ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
    #endif

-    #ifdef EIGEN2_SUPPORT
-    template<unsigned int Mode> typename internal::eigen2_part_return_type<Derived, Mode>::type part();
-    template<unsigned int Mode> const typename internal::eigen2_part_return_type<Derived, Mode>::type part() const;
-    
-    // huuuge hack. make Eigen2's matrix.part<Diagonal>() work in eigen3. Problem: Diagonal is now a class template instead
-    // of an integer constant. Solution: overload the part() method template wrt template parameters list.
-    template<template<typename T, int N> class U>
-    const DiagonalWrapper<ConstDiagonalReturnType> part() const
-    { return diagonal().asDiagonal(); }
-    #endif // EIGEN2_SUPPORT
-
    template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
    template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };

-    template<unsigned int Mode> typename TriangularViewReturnType<Mode>::Type triangularView();
-    template<unsigned int Mode> typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
+    template<unsigned int Mode>
+    EIGEN_DEVICE_FUNC
+    typename TriangularViewReturnType<Mode>::Type triangularView();
+    template<unsigned int Mode>
+    EIGEN_DEVICE_FUNC
+    typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;

    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };

-    template<unsigned int UpLo> typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
-    template<unsigned int UpLo> typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+    template<unsigned int UpLo> 
+    EIGEN_DEVICE_FUNC
+    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+    template<unsigned int UpLo>
+    EIGEN_DEVICE_FUNC
+    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;

    const SparseView<Derived> sparseView(const Scalar& m_reference = Scalar(0),
                                         const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
-    static const IdentityReturnType Identity();
-    static const IdentityReturnType Identity(Index rows, Index cols);
-    static const BasisReturnType Unit(Index size, Index i);
-    static const BasisReturnType Unit(Index i);
-    static const BasisReturnType UnitX();
-    static const BasisReturnType UnitY();
-    static const BasisReturnType UnitZ();
-    static const BasisReturnType UnitW();
+    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity();
+    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index size, Index i);
+    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index i);
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitX();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitY();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitW();

+    EIGEN_DEVICE_FUNC
    const DiagonalWrapper<const Derived> asDiagonal() const;
    const PermutationWrapper<const Derived> asPermutation() const;

+    EIGEN_DEVICE_FUNC
    Derived& setIdentity();
+    EIGEN_DEVICE_FUNC
    Derived& setIdentity(Index rows, Index cols);

    bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -310,42 +341,26 @@ template<typename Derived> class MatrixBase

    Scalar trace() const;

-/////////// Array module ///////////
+    template<int p> EIGEN_DEVICE_FUNC RealScalar lpNorm() const;

-    template<int p> RealScalar lpNorm() const;
-
-    MatrixBase<Derived>& matrix() { return *this; }
-    const MatrixBase<Derived>& matrix() const { return *this; }
+    EIGEN_DEVICE_FUNC MatrixBase<Derived>& matrix() { return *this; }
+    EIGEN_DEVICE_FUNC const MatrixBase<Derived>& matrix() const { return *this; }

    /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
      * \sa ArrayBase::matrix() */
-    ArrayWrapper<Derived> array() { return derived(); }
-    const ArrayWrapper<const Derived> array() const { return derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return derived(); }
+    /** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
+      * \sa ArrayBase::matrix() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return derived(); }

 /////////// LU module ///////////

-    const FullPivLU<PlainObject> fullPivLu() const;
-    const PartialPivLU<PlainObject> partialPivLu() const;
+    EIGEN_DEVICE_FUNC const FullPivLU<PlainObject> fullPivLu() const;
+    EIGEN_DEVICE_FUNC const PartialPivLU<PlainObject> partialPivLu() const;

-    #if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-    const LU<PlainObject> lu() const;
-    #endif
-
-    #ifdef EIGEN2_SUPPORT
-    const LU<PlainObject> eigen2_lu() const;
-    #endif
-
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
    const PartialPivLU<PlainObject> lu() const;
-    #endif
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename ResultType>
-    void computeInverse(MatrixBase<ResultType> *result) const {
-      *result = this->inverse();
-    }
-    #endif

+    EIGEN_DEVICE_FUNC
    const internal::inverse_impl<Derived> inverse() const;
    template<typename ResultType>
    void computeInverseAndDetWithCheck(
@@ -372,10 +387,6 @@ template<typename Derived> class MatrixBase
    const HouseholderQR<PlainObject> householderQr() const;
    const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
    const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    const QR<PlainObject> qr() const;
-    #endif

    EigenvaluesReturnType eigenvalues() const;
    RealScalar operatorNorm() const;
@@ -384,10 +395,6 @@ template<typename Derived> class MatrixBase

    JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;

-    #ifdef EIGEN2_SUPPORT
-    SVD<PlainObject> svd() const;
-    #endif
-
 /////////// Geometry module ///////////

    #ifndef EIGEN_PARSED_BY_DOXYGEN
@@ -398,20 +405,24 @@ template<typename Derived> class MatrixBase
    };
    #endif // EIGEN_PARSED_BY_DOXYGEN
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    typename cross_product_return_type<OtherDerived>::type
    cross(const MatrixBase<OtherDerived>& other) const;
+    
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
+    
+    EIGEN_DEVICE_FUNC
    PlainObject unitOrthogonal(void) const;
+    
    Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
    
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
    // put this as separate enum value to work around possible GCC 4.3 bug (?)
    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
    typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
    HomogeneousReturnType homogeneous() const;
-    #endif
    
    enum {
      SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
@@ -458,49 +469,15 @@ template<typename Derived> class MatrixBase
    const MatrixSquareRootReturnValue<Derived> sqrt() const;
    const MatrixLogarithmReturnValue<Derived> log() const;
    const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
-
-#ifdef EIGEN2_SUPPORT
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    /** \deprecated because .lazy() is deprecated
-      * Overloaded for cache friendly product evaluation */
-    template<typename OtherDerived>
-    Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeAssigningBit>& other)
-    { return lazyAssign(other._expression()); }
-
-    template<unsigned int Added>
-    const Flagged<Derived, Added, 0> marked() const;
-    const Flagged<Derived, 0, EvalBeforeAssigningBit> lazy() const;
-
-    inline const Cwise<Derived> cwise() const;
-    inline Cwise<Derived> cwise();
-
-    VectorBlock<Derived> start(Index size);
-    const VectorBlock<const Derived> start(Index size) const;
-    VectorBlock<Derived> end(Index size);
-    const VectorBlock<const Derived> end(Index size) const;
-    template<int Size> VectorBlock<Derived,Size> start();
-    template<int Size> const VectorBlock<const Derived,Size> start() const;
-    template<int Size> VectorBlock<Derived,Size> end();
-    template<int Size> const VectorBlock<const Derived,Size> end() const;
-
-    Minor<Derived> minor(Index row, Index col);
-    const Minor<Derived> minor(Index row, Index col) const;
-#endif
+    const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;

  protected:
-    MatrixBase() : Base() {}
+    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}

  private:
-    explicit MatrixBase(int);
-    MatrixBase(int,int);
-    template<typename OtherDerived> explicit MatrixBase(const MatrixBase<OtherDerived>&);
+    EIGEN_DEVICE_FUNC explicit MatrixBase(int);
+    EIGEN_DEVICE_FUNC MatrixBase(int,int);
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit MatrixBase(const MatrixBase<OtherDerived>&);
  protected:
    // mixing arrays and matrices is not legal
    template<typename OtherDerived> Derived& operator+=(const ArrayBase<OtherDerived>& )
@@ -510,6 +487,51 @@ template<typename Derived> class MatrixBase
    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
 };

+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+/** replaces \c *this by \c *this * \a other.
+  *
+  * \returns a reference to \c *this
+  *
+  * Example: \include MatrixBase_applyOnTheRight.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheRight.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived&
+MatrixBase<Derived>::operator*=(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheRight(derived());
+  return derived();
+}
+
+/** replaces \c *this by \c *this * \a other. It is equivalent to MatrixBase::operator*=().
+  *
+  * Example: \include MatrixBase_applyOnTheRight.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheRight.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheRight(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheRight(derived());
+}
+
+/** replaces \c *this by \a other * \c *this.
+  *
+  * Example: \include MatrixBase_applyOnTheLeft.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheLeft.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheLeft(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheLeft(derived());
+}
+
 } // end namespace Eigen

 #endif // EIGEN_MATRIXBASE_H
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -37,11 +37,13 @@ class NoAlias
    /** Behaves like MatrixBase::lazyAssign(other)
      * \sa MatrixBase::lazyAssign() */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
    { return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }

    /** \sa MatrixBase::operator+= */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
    {
      typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
@@ -54,6 +56,7 @@ class NoAlias

    /** \sa MatrixBase::operator-= */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
    {
      typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
@@ -66,10 +69,12 @@ class NoAlias

 #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
    { other.derived().addTo(m_expression); return m_expression; }

    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
    { other.derived().subTo(m_expression); return m_expression; }

@@ -78,6 +83,7 @@ class NoAlias
    { return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }

    template<typename Lhs, typename Rhs, int NestingFlags>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
    { return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
    
@@ -86,6 +92,7 @@ class NoAlias
    { return m_expression = func; }
 #endif

+    EIGEN_DEVICE_FUNC
    ExpressionType& expression() const
    {
      return m_expression;
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -68,7 +68,16 @@ template<typename T> struct GenericNumTraits
                   >::type NonInteger;
  typedef T Nested;

-  static inline Real epsilon() { return std::numeric_limits<T>::epsilon(); }
+  EIGEN_DEVICE_FUNC
+  static inline Real epsilon()
+  {
+    #if defined(__CUDA_ARCH__)
+    return internal::device::numeric_limits<T>::epsilon();
+    #else
+    return std::numeric_limits<T>::epsilon();
+    #endif
+  }
+  EIGEN_DEVICE_FUNC
  static inline Real dummy_precision()
  {
    // make sure to override this for floating-point types
@@ -76,13 +85,6 @@ template<typename T> struct GenericNumTraits
  }
  static inline T highest() { return (std::numeric_limits<T>::max)(); }
  static inline T lowest()  { return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); }
-  
-#ifdef EIGEN2_SUPPORT
-  enum {
-    HasFloatingPoint = !IsInteger
-  };
-  typedef NonInteger FloatingPoint;
-#endif
 };

 template<typename T> struct NumTraits : GenericNumTraits<T>
@@ -91,11 +93,13 @@ template<typename T> struct NumTraits : GenericNumTraits<T>
 template<> struct NumTraits<float>
  : GenericNumTraits<float>
 {
+  EIGEN_DEVICE_FUNC
  static inline float dummy_precision() { return 1e-5f; }
 };

 template<> struct NumTraits<double> : GenericNumTraits<double>
 {
+  EIGEN_DEVICE_FUNC
  static inline double dummy_precision() { return 1e-12; }
 };

--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -66,11 +66,11 @@ class PermutationBase : public EigenBase<Derived>
      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
    };
-    typedef typename Traits::Scalar Scalar;
+    typedef typename Traits::StorageIndexType StorageIndexType;
    typedef typename Traits::Index Index;
-    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
+    typedef Matrix<StorageIndexType,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
            DenseMatrixType;
-    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,Index>
+    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndexType>
            PlainPermutationType;
    using Base::derived;
    #endif
@@ -147,7 +147,7 @@ class PermutationBase : public EigenBase<Derived>
    /** Sets *this to be the identity permutation matrix */
    void setIdentity()
    {
-      for(Index i = 0; i < size(); ++i)
+      for(StorageIndexType i = 0; i < size(); ++i)
        indices().coeffRef(i) = i;
    }

@@ -173,8 +173,8 @@ class PermutationBase : public EigenBase<Derived>
      eigen_assert(i>=0 && j>=0 && i<size() && j<size());
      for(Index k = 0; k < size(); ++k)
      {
-        if(indices().coeff(k) == i) indices().coeffRef(k) = j;
-        else if(indices().coeff(k) == j) indices().coeffRef(k) = i;
+        if(indices().coeff(k) == i) indices().coeffRef(k) = StorageIndexType(j);
+        else if(indices().coeff(k) == j) indices().coeffRef(k) = StorageIndexType(i);
      }
      return derived();
    }
@@ -262,7 +262,7 @@ class PermutationBase : public EigenBase<Derived>
  *
  * \param SizeAtCompileTime the number of rows/cols, or Dynamic
  * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  * \param IndexType the interger type of the indices
+  * \param StorageIndexType the integer type of the indices
  *
  * This class represents a permutation matrix, internally stored as a vector of integers.
  *
@@ -270,17 +270,18 @@ class PermutationBase : public EigenBase<Derived>
  */

 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType>
+struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType> >
+ : traits<Matrix<_StorageIndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
-  typedef IndexType Index;
-  typedef Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef Matrix<_StorageIndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef typename IndicesType::Index Index;
+  typedef _StorageIndexType StorageIndexType;
 };
 }

-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType>
+class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType> >
 {
    typedef PermutationBase<PermutationMatrix> Base;
    typedef internal::traits<PermutationMatrix> Traits;
@@ -288,6 +289,8 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    typedef typename Traits::IndicesType IndicesType;
+    typedef typename Traits::StorageIndexType StorageIndexType;
+    typedef typename Traits::Index Index;
    #endif

    inline PermutationMatrix()
@@ -295,7 +298,7 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile

    /** Constructs an uninitialized permutation matrix of given size.
      */
-    inline PermutationMatrix(int size) : m_indices(size)
+    inline PermutationMatrix(Index size) : m_indices(size)
    {}

    /** Copy constructor. */
@@ -384,18 +387,19 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile


 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType, int _PacketAccess>
+struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType>,_PacketAccess> >
+ : traits<Matrix<_StorageIndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
-  typedef IndexType Index;
-  typedef Map<const Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef Map<const Matrix<_StorageIndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef typename IndicesType::Index Index;
+  typedef _StorageIndexType StorageIndexType;
 };
 }

-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess>
-  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType, int _PacketAccess>
+class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType>,_PacketAccess>
+  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType>,_PacketAccess> >
 {
    typedef PermutationBase<Map> Base;
    typedef internal::traits<Map> Traits;
@@ -403,14 +407,15 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndexType;
+    typedef typename IndicesType::Index  Index;
    #endif

-    inline Map(const Index* indicesPtr)
+    inline Map(const StorageIndexType* indicesPtr)
      : m_indices(indicesPtr)
    {}

-    inline Map(const Index* indicesPtr, Index size)
+    inline Map(const StorageIndexType* indicesPtr, Index size)
      : m_indices(indicesPtr,size)
    {}

@@ -466,7 +471,8 @@ struct traits<PermutationWrapper<_IndicesType> >
 {
  typedef PermutationStorage StorageKind;
  typedef typename _IndicesType::Scalar Scalar;
-  typedef typename _IndicesType::Scalar Index;
+  typedef typename _IndicesType::Scalar StorageIndexType;
+  typedef typename _IndicesType::Index Index;
  typedef _IndicesType IndicesType;
  enum {
    RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
@@ -553,7 +559,8 @@ struct permut_matrix_product_retval
    template<typename Dest> inline void evalTo(Dest& dst) const
    {
      const Index n = Side==OnTheLeft ? rows() : cols();
-
+      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
      if(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix))
      {
        // apply the permutation inplace
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -28,6 +28,7 @@ namespace internal {

 template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
  template<typename Index>
+  EIGEN_DEVICE_FUNC
  static EIGEN_ALWAYS_INLINE void run(Index, Index)
  {
  }
@@ -35,6 +36,7 @@ template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {

 template<> struct check_rows_cols_for_overflow<Dynamic> {
  template<typename Index>
+  EIGEN_DEVICE_FUNC
  static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
  {
    // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
@@ -47,7 +49,10 @@ template<> struct check_rows_cols_for_overflow<Dynamic> {
  }
 };

-template <typename Derived, typename OtherDerived = Derived, bool IsVector = bool(Derived::IsVectorAtCompileTime)> struct conservative_resize_like_impl;
+template <typename Derived,
+          typename OtherDerived = Derived,
+          bool IsVector = bool(Derived::IsVectorAtCompileTime) && bool(OtherDerived::IsVectorAtCompileTime)>
+struct conservative_resize_like_impl;

 template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct matrix_swap_impl;

@@ -126,12 +131,17 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)

+    EIGEN_DEVICE_FUNC
    Base& base() { return *static_cast<Base*>(this); }
+    EIGEN_DEVICE_FUNC
    const Base& base() const { return *static_cast<const Base*>(this); }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
    {
      if(Flags & RowMajorBit)
@@ -140,11 +150,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
    {
      return m_storage.data()[index];
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
    {
      if(Flags & RowMajorBit)
@@ -153,11 +165,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
    {
      return m_storage.data()[index];
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
    {
      if(Flags & RowMajorBit)
@@ -166,6 +180,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
    {
      return m_storage.data()[index];
@@ -229,6 +244,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
    {
      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
@@ -259,6 +275,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
      */
+    EIGEN_DEVICE_FUNC
    inline void resize(Index size)
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
@@ -283,6 +300,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \sa resize(Index,Index)
      */
+    EIGEN_DEVICE_FUNC
    inline void resize(NoChange_t, Index nbCols)
    {
      resize(rows(), nbCols);
@@ -296,6 +314,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * \sa resize(Index,Index)
      */
+    EIGEN_DEVICE_FUNC
    inline void resize(Index nbRows, NoChange_t)
    {
      resize(nbRows, cols());
@@ -309,6 +328,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
    {
      const OtherDerived& other = _other.derived();
@@ -336,6 +356,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * Matrices are resized relative to the top-left element. In case values need to be 
      * appended to the matrix they will be uninitialized.
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
    {
      internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
@@ -348,6 +369,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * In case the matrix is growing, new rows will be uninitialized.
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
    {
      // Note: see the comment in conservativeResize(Index,Index)
@@ -361,6 +383,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * In case the matrix is growing, new columns will be uninitialized.
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
    {
      // Note: see the comment in conservativeResize(Index,Index)
@@ -375,6 +398,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      *
      * When values are appended, they will be uninitialized.
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResize(Index size)
    {
      internal::conservative_resize_like_impl<Derived>::run(*this, size);
@@ -390,6 +414,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * appended to the matrix they will copied from \c other.
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other)
    {
      internal::conservative_resize_like_impl<Derived,OtherDerived>::run(*this, other);
@@ -398,6 +423,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    /** This is a special case of the templated operator=. Its purpose is to
      * prevent a default operator= from hiding the templated operator=.
      */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other)
    {
      return _set(other);
@@ -405,6 +431,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

    /** \sa MatrixBase::lazyAssign() */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other)
    {
      _resize_to_match(other);
@@ -412,12 +439,14 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func)
    {
      resize(func.rows(), func.cols());
      return Base::operator=(func);
    }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
    {
 //       _check_template_params();
@@ -427,6 +456,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    // FIXME is it still needed ?
    /** \internal */
+    EIGEN_DEVICE_FUNC
    PlainObjectBase(internal::constructor_without_unaligned_array_assert)
      : m_storage(internal::constructor_without_unaligned_array_assert())
    {
@@ -434,6 +464,23 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }
 #endif

+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    PlainObjectBase(PlainObjectBase&& other)
+      : m_storage( std::move(other.m_storage) )
+    {
+    }
+
+    EIGEN_DEVICE_FUNC
+    PlainObjectBase& operator=(PlainObjectBase&& other)
+    {
+      using std::swap;
+      swap(m_storage, other.m_storage);
+      return *this;
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
      : m_storage(a_size, nbRows, nbCols)
    {
@@ -444,6 +491,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
    {
      _resize_to_match(other);
@@ -453,6 +501,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
      : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
    {
@@ -535,16 +584,16 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    //@}

    using Base::setConstant;
-    Derived& setConstant(Index size, const Scalar& value);
-    Derived& setConstant(Index rows, Index cols, const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& value);

    using Base::setZero;
-    Derived& setZero(Index size);
-    Derived& setZero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index size);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);

    using Base::setOnes;
-    Derived& setOnes(Index size);
-    Derived& setOnes(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);

    using Base::setRandom;
    Derived& setRandom(Index size);
@@ -563,6 +612,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * remain row-vectors and vectors remain vectors.
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
    {
      #ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -590,6 +640,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * \internal
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
    {
      _set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
@@ -597,9 +648,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }

    /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
@@ -608,6 +661,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * \sa operator=(const MatrixBase<OtherDerived>&), _set()
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
    {
      // I don't think we need this resize call since the lazyAssign will anyways resize
@@ -619,6 +673,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }

    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
    {
      EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
@@ -626,13 +681,91 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
      resize(nbRows,nbCols);
    }
+    
    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
    {
      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
      m_storage.data()[0] = val0;
      m_storage.data()[1] = val1;
    }
+    
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC 
+    EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<T0,Index>::value)
+                                                                  && (internal::is_same<T1,Index>::value)
+                                                                  && Base::SizeAtCompileTime==2,T1>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
+    }
+
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value,T>::type* = 0)
+    {
+      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
+      const bool is_integer = NumTraits<T>::IsInteger;
+      EIGEN_STATIC_ASSERT(is_integer,
+                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
+      resize(size);
+    }
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+      m_storage.data()[0] = val0;
+    }
+    
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime==1
+                                                                  && internal::is_convertible<T, Scalar>::value,T*>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+      m_storage.data()[0] = Scalar(val0);
+    }
+
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar* data){
+      this->_set_noalias(ConstMapType(data));
+    }
+
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other){
+      this->_set_noalias(other);
+    }
+
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other){
+      this->derived() = other;
+    }
+
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other)
+    {
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
+    }
+
+    template<typename T, typename OtherDerived, int ColsAtCompileTime>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
+    {
+      this->derived() = r;
+    }

    template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
    friend struct internal::matrix_swap_impl;
@@ -641,6 +774,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * data pointers.
      */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void _swap(DenseBase<OtherDerived> const & other)
    {
      enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
@@ -649,6 +783,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

  public:
 #ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC 
    static EIGEN_STRONG_INLINE void _check_template_params()
    {
      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
@@ -668,8 +803,10 @@ private:
    enum { ThisConstantIsPrivateInPlainObjectBase };
 };

+namespace internal {
+
 template <typename Derived, typename OtherDerived, bool IsVector>
-struct internal::conservative_resize_like_impl
+struct conservative_resize_like_impl
 {
  typedef typename Derived::Index Index;
  static void run(DenseBase<Derived>& _this, Index rows, Index cols)
@@ -729,11 +866,14 @@ struct internal::conservative_resize_like_impl
  }
 };

-namespace internal {
-
+// Here, the specialization for vectors inherits from the general matrix case
+// to allow calling .conservativeResize(rows,cols) on vectors.
 template <typename Derived, typename OtherDerived>
 struct conservative_resize_like_impl<Derived,OtherDerived,true>
+  : conservative_resize_like_impl<Derived,OtherDerived,false>
 {
+  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
+  
  typedef typename Derived::Index Index;
  static void run(DenseBase<Derived>& _this, Index size)
  {
@@ -760,6 +900,7 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
 template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
 struct matrix_swap_impl
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
  {
    a.base().swap(b);
@@ -769,6 +910,7 @@ struct matrix_swap_impl
 template<typename MatrixTypeA, typename MatrixTypeB>
 struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true>
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
  {
    static_cast<typename MatrixTypeA::Base&>(a).m_storage.swap(static_cast<typename MatrixTypeB::Base&>(b).m_storage);
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -0,0 +1,107 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PRODUCT_H
+#define EIGEN_PRODUCT_H
+
+namespace Eigen {
+
+template<typename Lhs, typename Rhs> class Product;
+template<typename Lhs, typename Rhs, typename StorageKind> class ProductImpl;
+
+/** \class Product
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the product of two arbitrary matrices or vectors
+  *
+  * \param Lhs the type of the left-hand side expression
+  * \param Rhs the type of the right-hand side expression
+  *
+  * This class represents an expression of the product of two arbitrary matrices.
+  *
+  */
+
+// Use ProductReturnType to get correct traits, in particular vectorization flags
+namespace internal {
+template<typename Lhs, typename Rhs>
+struct traits<Product<Lhs, Rhs> >
+  : traits<typename ProductReturnType<Lhs, Rhs>::Type>
+{ 
+  // We want A+B*C to be of type Product<Matrix, Sum> and not Product<Matrix, Matrix>
+  // TODO: This flag should eventually go in a separate evaluator traits class
+  enum {
+    Flags = traits<typename ProductReturnType<Lhs, Rhs>::Type>::Flags & ~(EvalBeforeNestingBit | DirectAccessBit)
+  };
+};
+} // end namespace internal
+
+
+template<typename Lhs, typename Rhs>
+class Product : public ProductImpl<Lhs,Rhs,typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                                                            typename internal::traits<Rhs>::StorageKind>::ret>
+{
+  public:
+    
+    typedef typename ProductImpl<
+        Lhs, Rhs,
+        typename internal::promote_storage_type<typename Lhs::StorageKind,
+                                                typename Rhs::StorageKind>::ret>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
+
+    typedef typename Lhs::Nested LhsNested;
+    typedef typename Rhs::Nested RhsNested;
+    typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+    typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+
+    Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    {
+      eigen_assert(lhs.cols() == rhs.rows()
+        && "invalid matrix product"
+        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+    }
+
+    inline Index rows() const { return m_lhs.rows(); }
+    inline Index cols() const { return m_rhs.cols(); }
+
+    const LhsNestedCleaned& lhs() const { return m_lhs; }
+    const RhsNestedCleaned& rhs() const { return m_rhs; }
+
+  protected:
+
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+template<typename Lhs, typename Rhs>
+class ProductImpl<Lhs,Rhs,Dense> : public internal::dense_xpr_base<Product<Lhs,Rhs> >::type
+{
+    typedef Product<Lhs, Rhs> Derived;
+  public:
+
+    typedef typename internal::dense_xpr_base<Product<Lhs, Rhs> >::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+};
+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+
+/** \internal used to test the evaluator only
+  */
+template<typename Lhs,typename Rhs>
+const Product<Lhs,Rhs>
+prod(const Lhs& lhs, const Rhs& rhs)
+{
+  return Product<Lhs,Rhs>(lhs,rhs);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_H
--- a/Eigen/src/Core/ProductBase.h
+++ b/Eigen/src/Core/ProductBase.h
@@ -131,17 +131,13 @@ class ProductBase : public MatrixBase<Derived>
    const Diagonal<FullyLazyCoeffBaseProductType,Dynamic> diagonal(Index index) const
    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs).diagonal(index); }

-    // restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isnt a Lvalue expression
+    // restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isn't an Lvalue expression
    typename Base::CoeffReturnType coeff(Index row, Index col) const
    {
-#ifdef EIGEN2_SUPPORT
-      return lhs().row(row).cwiseProduct(rhs().col(col).transpose()).sum();
-#else
      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
      eigen_assert(this->rows() == 1 && this->cols() == 1);
      Matrix<Scalar,1,1> result = *this;
      return result.coeff(row,col);
-#endif
    }

    typename Base::CoeffReturnType coeff(Index i) const
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -0,0 +1,411 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_PRODUCTEVALUATORS_H
+#define EIGEN_PRODUCTEVALUATORS_H
+
+namespace Eigen {
+  
+namespace internal {
+  
+// We can evaluate the product either all at once, like GeneralProduct and its evalTo() function, or
+// traverse the matrix coefficient by coefficient, like CoeffBasedProduct.  Use the existing logic
+// in ProductReturnType to decide.
+
+template<typename XprType, typename ProductType>
+struct product_evaluator_dispatcher;
+
+template<typename Lhs, typename Rhs>
+struct evaluator_impl<Product<Lhs, Rhs> >
+  : product_evaluator_dispatcher<Product<Lhs, Rhs>, typename ProductReturnType<Lhs, Rhs>::Type> 
+{
+  typedef Product<Lhs, Rhs> XprType;
+  typedef product_evaluator_dispatcher<XprType, typename ProductReturnType<Lhs, Rhs>::Type> Base;
+
+  evaluator_impl(const XprType& xpr) : Base(xpr) 
+  { }
+};
+
+template<typename XprType, typename ProductType>
+struct product_evaluator_traits_dispatcher;
+
+template<typename Lhs, typename Rhs>
+struct evaluator_traits<Product<Lhs, Rhs> >
+  : product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, typename ProductReturnType<Lhs, Rhs>::Type> 
+{ 
+  static const int AssumeAliasing = 1;
+};
+
+// Case 1: Evaluate all at once
+//
+// We can view the GeneralProduct class as a part of the product evaluator. 
+// Four sub-cases: InnerProduct, OuterProduct, GemmProduct and GemvProduct.
+// InnerProduct is special because GeneralProduct does not have an evalTo() method in this case.
+
+template<typename Lhs, typename Rhs>
+struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, InnerProduct> > 
+{
+  static const int HasEvalTo = 0;
+};
+
+template<typename Lhs, typename Rhs>
+struct product_evaluator_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, InnerProduct> > 
+  : public evaluator<typename Product<Lhs, Rhs>::PlainObject>::type
+{
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef typename evaluator<PlainObject>::type evaluator_base;
+
+  // TODO: Computation is too early (?)
+  product_evaluator_dispatcher(const XprType& xpr) : evaluator_base(m_result)
+  {
+    m_result.coeffRef(0,0) = (xpr.lhs().transpose().cwiseProduct(xpr.rhs())).sum();
+  }
+  
+protected:  
+  PlainObject m_result;
+};
+
+// For the other three subcases, simply call the evalTo() method of GeneralProduct
+// TODO: GeneralProduct should take evaluators, not expression objects.
+
+template<typename Lhs, typename Rhs, int ProductType>
+struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, ProductType> > 
+{
+  static const int HasEvalTo = 1;
+};
+
+template<typename Lhs, typename Rhs, int ProductType>
+struct product_evaluator_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, ProductType> > 
+{
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef typename evaluator<PlainObject>::type evaluator_base;
+  
+  product_evaluator_dispatcher(const XprType& xpr) : m_xpr(xpr)
+  { }
+  
+  template<typename DstEvaluatorType, typename DstXprType>
+  void evalTo(DstEvaluatorType /* not used */, DstXprType& dst) const
+  {
+    dst.resize(m_xpr.rows(), m_xpr.cols());
+    GeneralProduct<Lhs, Rhs, ProductType>(m_xpr.lhs(), m_xpr.rhs()).evalTo(dst);
+  }
+  
+protected: 
+  const XprType& m_xpr;
+};
+
+// Case 2: Evaluate coeff by coeff
+//
+// This is mostly taken from CoeffBasedProduct.h
+// The main difference is that we add an extra argument to the etor_product_*_impl::run() function
+// for the inner dimension of the product, because evaluator object do not know their size.
+
+template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl;
+
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl;
+
+template<typename Lhs, typename Rhs, typename LhsNested, typename RhsNested, int Flags>
+struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, CoeffBasedProduct<LhsNested, RhsNested, Flags> >
+{
+  static const int HasEvalTo = 0;
+};
+
+template<typename Lhs, typename Rhs, typename LhsNested, typename RhsNested, int Flags>
+struct product_evaluator_dispatcher<Product<Lhs, Rhs>, CoeffBasedProduct<LhsNested, RhsNested, Flags> >
+  : evaluator_impl_base<Product<Lhs, Rhs> >
+{
+  typedef Product<Lhs, Rhs> XprType;
+  typedef CoeffBasedProduct<LhsNested, RhsNested, Flags> CoeffBasedProductType;
+
+  product_evaluator_dispatcher(const XprType& xpr) 
+    : m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs()),  
+      m_innerDim(xpr.lhs().cols())
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketScalar PacketScalar;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  // Everything below here is taken from CoeffBasedProduct.h
+
+  enum {
+    RowsAtCompileTime = traits<CoeffBasedProductType>::RowsAtCompileTime,
+    PacketSize = packet_traits<Scalar>::size,
+    InnerSize  = traits<CoeffBasedProductType>::InnerSize,
+    CoeffReadCost = traits<CoeffBasedProductType>::CoeffReadCost,
+    Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+    CanVectorizeInner = traits<CoeffBasedProductType>::CanVectorizeInner
+  };
+
+  typedef typename evaluator<Lhs>::type LhsEtorType;
+  typedef typename evaluator<Rhs>::type RhsEtorType;
+  typedef etor_product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
+                                  Unroll ? InnerSize-1 : Dynamic,
+                                  LhsEtorType, RhsEtorType, Scalar> CoeffImpl;
+
+  const CoeffReturnType coeff(Index row, Index col) const
+  {
+    Scalar res;
+    CoeffImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+  /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
+   * which is why we don't set the LinearAccessBit.
+   */
+  const CoeffReturnType coeff(Index index) const
+  {
+    Scalar res;
+    const Index row = RowsAtCompileTime == 1 ? 0 : index;
+    const Index col = RowsAtCompileTime == 1 ? index : 0;
+    CoeffImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+  template<int LoadMode>
+  const PacketReturnType packet(Index row, Index col) const
+  {
+    PacketScalar res;
+    typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
+				     Unroll ? InnerSize-1 : Dynamic,
+				     LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl;
+    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+protected:
+  typename evaluator<Lhs>::type m_lhsImpl;
+  typename evaluator<Rhs>::type m_rhsImpl;
+
+  // TODO: Get rid of m_innerDim if known at compile time
+  Index m_innerDim;
+};
+
+/***************************************************************************
+* Normal product .coeff() implementation (with meta-unrolling)
+***************************************************************************/
+
+/**************************************
+*** Scalar path  - no vectorization ***
+**************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar &res)
+  {
+    etor_product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, innerDim, res);
+    res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, RetScalar &res)
+  {
+    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar& res)
+  {
+    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
+    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+    for(Index i = 1; i < innerDim; ++i)
+      res += lhs.coeff(row, i) * rhs.coeff(i, col);
+  }
+};
+
+/*******************************************
+*** Scalar path with inner vectorization ***
+*******************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
+struct etor_product_coeff_vectorized_unroller
+{
+  typedef typename Lhs::Index Index;
+  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, typename Lhs::PacketScalar &pres)
+  {
+    etor_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, innerDim, pres);
+    pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet>
+struct etor_product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::PacketScalar &pres)
+  {
+    pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::PacketScalar Packet;
+  typedef typename Lhs::Index Index;
+  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar &res)
+  {
+    Packet pres;
+    etor_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, innerDim, pres);
+    etor_product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, innerDim, res);
+    res = predux(pres);
+  }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
+struct etor_product_coeff_vectorized_dyn_selector
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
+  {
+    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
+  }
+};
+
+// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
+// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
+template<typename Lhs, typename Rhs, int RhsCols>
+struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
+  {
+    res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows>
+struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
+  {
+    res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs>
+struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
+{
+  typedef typename Lhs::Index Index;
+  EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
+  {
+    res = lhs.transpose().cwiseProduct(rhs).sum();
+  }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, typename Lhs::Scalar &res)
+  {
+    etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, innerDim, res);
+  }
+};
+
+/*******************
+*** Packet path  ***
+*******************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
+    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+    for(Index i = 1; i < innerDim; ++i)
+      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  typedef typename Lhs::Index Index;
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
+    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    for(Index i = 1; i < innerDim; ++i)
+      res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_EVALUATORS_H
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -28,12 +28,18 @@ struct functor_traits<scalar_random_op<Scalar> >

 /** \returns a random matrix expression
  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
  * The parameters \a rows and \a cols are the number of rows and of columns of
  * the returned matrix. Must be compatible with this MatrixBase type.
  *
+  * \not_reentrant
+  * 
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
  * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
  * instead.
+  * 
  *
  * Example: \include MatrixBase_random_int_int.cpp
  * Output: \verbinclude MatrixBase_random_int_int.out
@@ -41,8 +47,10 @@ struct functor_traits<scalar_random_op<Scalar> >
  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
+  * 
+  * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
  *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index), MatrixBase::Random()
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
  */
 template<typename Derived>
 inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -52,11 +60,15 @@ DenseBase<Derived>::Random(Index rows, Index cols)
 }

 /** \returns a random vector expression
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
  *
  * The parameter \a size is the size of the returned vector.
  * Must be compatible with this MatrixBase type.
  *
  * \only_for_vectors
+  * \not_reentrant
  *
  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
  * it is redundant to pass \a size as argument, so Random() should be used
@@ -69,7 +81,7 @@ DenseBase<Derived>::Random(Index rows, Index cols)
  * a temporary vector whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
  *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random()
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
  */
 template<typename Derived>
 inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -80,6 +92,9 @@ DenseBase<Derived>::Random(Index size)

 /** \returns a fixed-size random matrix or vector expression
  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
  * need to use the variants taking size arguments.
  *
@@ -89,8 +104,10 @@ DenseBase<Derived>::Random(Index size)
  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
  * behavior with expressions involving random matrices.
+  * 
+  * \not_reentrant
  *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random(Index)
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
  */
 template<typename Derived>
 inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -101,6 +118,11 @@ DenseBase<Derived>::Random()

 /** Sets all coefficients in this expression to random values.
  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
+  * \not_reentrant
+  * 
  * Example: \include MatrixBase_setRandom.cpp
  * Output: \verbinclude MatrixBase_setRandom.out
  *
@@ -114,12 +136,16 @@ inline Derived& DenseBase<Derived>::setRandom()

 /** Resizes to the given \a newSize, and sets all coefficients in this expression to random values.
  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
  * \only_for_vectors
+  * \not_reentrant
  *
  * Example: \include Matrix_setRandom_int.cpp
  * Output: \verbinclude Matrix_setRandom_int.out
  *
-  * \sa MatrixBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, MatrixBase::Random()
+  * \sa DenseBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, DenseBase::Random()
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
@@ -131,13 +157,18 @@ PlainObjectBase<Derived>::setRandom(Index newSize)

 /** Resizes to the given size, and sets all coefficients in this expression to random values.
  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  * 
  * \param nbRows the new number of rows
  * \param nbCols the new number of columns
  *
  * Example: \include Matrix_setRandom_int_int.cpp
  * Output: \verbinclude Matrix_setRandom_int_int.out
  *
-  * \sa MatrixBase::setRandom(), setRandom(Index), class CwiseNullaryOp, MatrixBase::Random()
+  * \sa DenseBase::setRandom(), setRandom(Index), class CwiseNullaryOp, DenseBase::Random()
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -82,6 +82,7 @@ struct redux_novec_unroller

  typedef typename Derived::Scalar Scalar;

+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
  {
    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
@@ -99,6 +100,7 @@ struct redux_novec_unroller<Func, Derived, Start, 1>

  typedef typename Derived::Scalar Scalar;

+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
  {
    return mat.coeffByOuterInner(outer, inner);
@@ -112,6 +114,7 @@ template<typename Func, typename Derived, int Start>
 struct redux_novec_unroller<Func, Derived, Start, 0>
 {
  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
 };

@@ -170,6 +173,7 @@ struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
 {
  typedef typename Derived::Scalar Scalar;
  typedef typename Derived::Index Index;
+  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
  {
    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
@@ -203,7 +207,7 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
    const Index packetSize = packet_traits<Scalar>::size;
    const Index alignedStart = internal::first_aligned(mat);
    enum {
-      alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
+      alignment = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) || bool(Derived::Flags & AlignedBit)
                ? Aligned : Unaligned
    };
    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
@@ -299,10 +303,15 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
  {
    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
-    if (VectorizedSize != Size)
-      res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
-    return res;
+    if (VectorizedSize > 0) {
+      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      if (VectorizedSize != Size)
+        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+      return res;
+    }
+    else {
+      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+    }
  }
 };

--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -19,17 +19,17 @@ template<typename PlainObjectType, int Options = 0,
 /** \class Ref
  * \ingroup Core_Module
  *
-  * \brief A matrix or vector expression mapping an existing expressions
+  * \brief A matrix or vector expression mapping an existing expression
  *
  * \tparam PlainObjectType the equivalent matrix type of the mapped data
  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
  *                The default is \c #Unaligned.
  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
-  *                   but accept a variable outer stride (leading dimension).
+  *                   but accepts a variable outer stride (leading dimension).
  *                   This can be overridden by specifying strides.
  *                   The type passed here must be a specialization of the Stride template, see examples below.
  *
-  * This class permits to write non template functions taking Eigen's object as parameters while limiting the number of copies.
+  * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
  * A Ref<> object can represent either a const expression or a l-value:
  * \code
  * // in-out argument:
@@ -39,10 +39,10 @@ template<typename PlainObjectType, int Options = 0,
  * void foo2(const Ref<const VectorXf>& x);
  * \endcode
  *
-  * In the in-out case, the input argument must satisfies the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
+  * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
-  * Likewise, a Ref<MatrixXf> can reference any column major dense matrix expression of float whose column's elements are contiguously stored with
-  * the possibility to have a constant space inbetween each column, i.e.: the inner stride mmust be equal to 1, but the outer-stride (or leading dimension),
+  * Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
+  * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
  * can be greater than the number of rows.
  *
  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
@@ -58,15 +58,15 @@ template<typename PlainObjectType, int Options = 0,
  * foo2(A.col().segment(2,4)); // No temporary
  * \endcode
  *
-  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameter.
+  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
  * Here is an example accepting an innerstride!=1:
  * \code
  * // in-out argument:
  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
  * foo3(A.row());              // OK
  * \endcode
-  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involved more
-  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overloads internally calling a
+  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
+  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a
  * template function, e.g.:
  * \code
  * // in the .h:
@@ -94,13 +94,14 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
  typedef _PlainObjectType PlainObjectType;
  typedef _StrideType StrideType;
  enum {
-    Options = _Options
+    Options = _Options,
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit
  };

  template<typename Derived> struct match {
    enum {
      HasDirectAccess = internal::has_direct_access<Derived>::ret,
-      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
      InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
                      || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
@@ -111,7 +112,7 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
    };
    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
  };
-
+  
 };

 template<typename Derived>
@@ -171,8 +172,12 @@ protected:
    }
    else
      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
-    ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                 StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+    
+    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
+      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
+    else
+      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
+                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
  }

  StrideBase m_stride;
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -57,10 +57,11 @@ template<typename Derived> class ReturnByValue
    EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)

    template<typename Dest>
+    EIGEN_DEVICE_FUNC
    inline void evalTo(Dest& dst) const
    { static_cast<const Derived*>(this)->evalTo(dst); }
-    inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
-    inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }

 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -69,17 +69,23 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
    };
    typedef typename MatrixType::PlainObject PlainObject;

+    EIGEN_DEVICE_FUNC
    inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
    {}

+    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return m_matrix.innerStride(); }

    /** \sa MatrixBase::coeff()
      * \warning the coordinates must fit into the referenced triangular part
      */
+    EIGEN_DEVICE_FUNC
    inline Scalar coeff(Index row, Index col) const
    {
      Base::check_coordinates_internal(row, col);
@@ -89,6 +95,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
    /** \sa MatrixBase::coeffRef()
      * \warning the coordinates must fit into the referenced triangular part
      */
+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index row, Index col)
    {
      Base::check_coordinates_internal(row, col);
@@ -96,13 +103,17 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
    }

    /** \internal */
+    EIGEN_DEVICE_FUNC
    const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }

+    EIGEN_DEVICE_FUNC
    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+    EIGEN_DEVICE_FUNC
    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }

    /** Efficient self-adjoint matrix times vector/matrix product */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
    operator*(const MatrixBase<OtherDerived>& rhs) const
    {
@@ -113,6 +124,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView

    /** Efficient vector/matrix times self-adjoint matrix product */
    template<typename OtherDerived> friend
+    EIGEN_DEVICE_FUNC
    SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
    operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
    {
@@ -132,6 +144,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
      * \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
      */
    template<typename DerivedU, typename DerivedV>
+    EIGEN_DEVICE_FUNC
    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));

    /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
@@ -145,6 +158,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
      * \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
      */
    template<typename DerivedU>
+    EIGEN_DEVICE_FUNC
    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));

 /////////// Cholesky module ///////////
@@ -159,31 +173,10 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
    /** Return type of eigenvalues() */
    typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;

+    EIGEN_DEVICE_FUNC
    EigenvaluesReturnType eigenvalues() const;
+    EIGEN_DEVICE_FUNC
    RealScalar operatorNorm() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    SelfAdjointView& operator=(const MatrixBase<OtherDerived>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other;
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.adjoint();
-      return *this;
-    }
-    template<typename OtherMatrixType, unsigned int OtherMode>
-    SelfAdjointView& operator=(const TriangularView<OtherMatrixType, OtherMode>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other.toDenseMatrix();
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.toDenseMatrix().adjoint();
-      return *this;
-    }
-    #endif

  protected:
    MatrixTypeNested m_matrix;
@@ -209,6 +202,7 @@ struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), U
    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
  };

+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
@@ -223,6 +217,7 @@ struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), U
 template<typename Derived1, typename Derived2, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &, const Derived2 &) {}
 };

@@ -234,6 +229,7 @@ struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), U
    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
  };

+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
@@ -248,6 +244,7 @@ struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), U
 template<typename Derived1, typename Derived2, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &, const Derived2 &) {}
 };

@@ -255,6 +252,7 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
 {
  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    for(Index j = 0; j < dst.cols(); ++j)
@@ -272,6 +270,7 @@ struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dyn
 template<typename Derived1, typename Derived2, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
  typedef typename Derived1::Index Index;
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -35,7 +35,7 @@ struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
  enum {
    // Note that it is still a good idea to preserve the DirectAccessBit
    // so that assign can correctly align the data.
-    Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
+    Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&AlignedBit) | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
    OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
    InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
  };
@@ -52,21 +52,24 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp

    typedef typename internal::packet_traits<Scalar>::type Packet;

+    EIGEN_DEVICE_FUNC
    inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}

-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-    inline const Scalar* data() const { return m_matrix.data(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_matrix.innerStride(); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_matrix.data(); }

    // note that this function is needed by assign to correctly align loads/stores
    // TODO make Assign use .data()
+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index row, Index col)
    {
      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
      return m_matrix.const_cast_derived().coeffRef(row, col);
    }
+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index row, Index col) const
    {
      return m_matrix.coeffRef(row, col);
@@ -74,17 +77,20 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp

    // note that this function is needed by assign to correctly align loads/stores
    // TODO make Assign use .data()
+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index)
    {
      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
      return m_matrix.const_cast_derived().coeffRef(index);
    }
+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return m_matrix.const_cast_derived().coeffRef(index);
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
    {
      OtherDerived& _other = other.const_cast_derived();
@@ -95,6 +101,7 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
    {
      OtherDerived& _other = other.const_cast_derived();
@@ -125,6 +132,7 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
    // reimplement lazyAssign to handle complex *= real
    // see CwiseBinaryOp ctor for details
    template<typename RhsDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
    {
      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
@@ -144,17 +152,20 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
    // overloaded to honor evaluation of special matrices
    // maybe another solution would be to not use SelfCwiseBinaryOp
    // at first...
+    EIGEN_DEVICE_FUNC
    SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
    {
      typename internal::nested<Rhs>::type rhs(_rhs);
      return Base::operator=(rhs);
    }

+    EIGEN_DEVICE_FUNC
    Lhs& expression() const 
    { 
      return m_matrix;
    }

+    EIGEN_DEVICE_FUNC
    const BinaryOp& functor() const 
    { 
      return m_functor;
@@ -177,18 +188,30 @@ inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
  return derived();
 }

+template<typename Derived>
+inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
+{
+  typedef typename Derived::PlainObject PlainObject;
+  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
+  tmp = PlainObject::Constant(rows(),cols(),other);
+  return derived();
+}
+
+template<typename Derived>
+inline Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
+{
+  typedef typename Derived::PlainObject PlainObject;
+  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
+  tmp = PlainObject::Constant(rows(),cols(),other);
+  return derived();
+}
+
 template<typename Derived>
 inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
-  typedef typename internal::conditional<NumTraits<Scalar>::IsInteger,
-                                        internal::scalar_quotient_op<Scalar>,
-                                        internal::scalar_product_op<Scalar> >::type BinOp;
  typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<BinOp, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  Scalar actual_other;
-  if(NumTraits<Scalar>::IsInteger)  actual_other = other;
-  else                              actual_other = Scalar(1)/other;
-  tmp = PlainObject::Constant(rows(),cols(), actual_other);
+  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
+  tmp = PlainObject::Constant(rows(),cols(), other);
  return derived();
 }

--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -17,16 +17,38 @@ namespace internal {
 template<typename ExpressionType, typename Scalar>
 inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
 {
-  Scalar max = bl.cwiseAbs().maxCoeff();
-  if (max>scale)
+  using std::max;
+  Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
+  
+  if(maxCoeff>scale)
  {
-    ssq = ssq * numext::abs2(scale/max);
-    scale = max;
-    invScale = Scalar(1)/scale;
+    ssq = ssq * numext::abs2(scale/maxCoeff);
+    Scalar tmp = Scalar(1)/maxCoeff;
+    if(tmp > NumTraits<Scalar>::highest())
+    {
+      invScale = NumTraits<Scalar>::highest();
+      scale = Scalar(1)/invScale;
+    }
+    else if(maxCoeff>NumTraits<Scalar>::highest()) // we got a INF
+    {
+      invScale = Scalar(1);
+      scale = maxCoeff;
+    }
+    else
+    {
+      scale = maxCoeff;
+      invScale = tmp;
+    }
  }
-  // TODO if the max is much much smaller than the current scale,
+  else if(maxCoeff!=maxCoeff) // we got a NaN
+  {
+    scale = maxCoeff;
+  }
+  
+  // TODO if the maxCoeff is much much smaller than the current scale,
  // then we can neglect this sub vector
-  ssq += (bl*invScale).squaredNorm();
+  if(scale>Scalar(0)) // if scale==0, then bl is 0 
+    ssq += (bl*invScale).squaredNorm();
 }

 template<typename Derived>
@@ -36,13 +58,13 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
  typedef typename Derived::RealScalar RealScalar;  
  typedef typename Derived::Index Index;
  using std::pow;
-  using std::min;
-  using std::max;
+  EIGEN_USING_STD_MATH(min);
+  EIGEN_USING_STD_MATH(max);
  using std::sqrt;
  using std::abs;
  const Derived& vec(_vec.derived());
  static bool initialized = false;
-  static RealScalar b1, b2, s1m, s2m, overfl, rbig, relerr;
+  static RealScalar b1, b2, s1m, s2m, rbig, relerr;
  if(!initialized)
  {
    int ibeta, it, iemin, iemax, iexp;
@@ -71,7 +93,6 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
    iexp  = - ((iemax+it)/2);
    s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range

-    overfl  = rbig*s2m;                                             // overflow boundary for abig
    eps     = RealScalar(pow(double(ibeta), 1-it));
    relerr  = sqrt(eps);                                            // tolerance for neglecting asml
    initialized = true;
@@ -88,13 +109,13 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
    else if(ax < b1) asml += numext::abs2(ax*s1m);
    else             amed += numext::abs2(ax);
  }
+  if(amed!=amed)
+    return amed;  // we got a NaN
  if(abig > RealScalar(0))
  {
    abig = sqrt(abig);
-    if(abig > overfl)
-    {
-      return rbig;
-    }
+    if(abig > rbig) // overflow, or *this contains INF values
+      return abig;  // return INF
    if(amed > RealScalar(0))
    {
      abig = abig/s2m;
@@ -139,7 +160,7 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
-  using std::min;
+  EIGEN_USING_STD_MATH(min);
  using std::sqrt;
  const Index blockSize = 4096;
  RealScalar scale(0);
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -51,6 +51,7 @@ class Stride
    };

    /** Default constructor, for use when strides are fixed at compile time */
+    EIGEN_DEVICE_FUNC
    Stride()
      : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
    {
@@ -58,6 +59,7 @@ class Stride
    }

    /** Constructor allowing to pass the strides at runtime */
+    EIGEN_DEVICE_FUNC
    Stride(Index outerStride, Index innerStride)
      : m_outer(outerStride), m_inner(innerStride)
    {
@@ -65,13 +67,16 @@ class Stride
    }

    /** Copy constructor */
+    EIGEN_DEVICE_FUNC
    Stride(const Stride& other)
      : m_outer(other.outer()), m_inner(other.inner())
    {}

    /** \returns the outer stride */
+    EIGEN_DEVICE_FUNC
    inline Index outer() const { return m_outer.value(); }
    /** \returns the inner stride */
+    EIGEN_DEVICE_FUNC
    inline Index inner() const { return m_inner.value(); }

  protected:
@@ -87,8 +92,8 @@ class InnerStride : public Stride<0, Value>
    typedef Stride<0, Value> Base;
  public:
    typedef DenseIndex Index;
-    InnerStride() : Base() {}
-    InnerStride(Index v) : Base(0, v) {}
+    EIGEN_DEVICE_FUNC InnerStride() : Base() {}
+    EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {}
 };

 /** \brief Convenience specialization of Stride to specify only an outer stride
@@ -99,8 +104,8 @@ class OuterStride : public Stride<Value, 0>
    typedef Stride<Value, 0> Base;
  public:
    typedef DenseIndex Index;
-    OuterStride() : Base() {}
-    OuterStride(Index v) : Base(v,0) {}
+    EIGEN_DEVICE_FUNC OuterStride() : Base() {}
+    EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {}
 };

 } // end namespace Eigen
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -33,11 +33,16 @@ template<typename ExpressionType> class SwapWrapper
    EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
    typedef typename internal::packet_traits<Scalar>::type Packet;

+    EIGEN_DEVICE_FUNC
    inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}

+    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return m_expression.innerStride(); }
    
    typedef typename internal::conditional<
@@ -46,30 +51,37 @@ template<typename ExpressionType> class SwapWrapper
                       const Scalar
                     >::type ScalarWithConstIfNotLvalue;
                     
+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+    EIGEN_DEVICE_FUNC
    inline const Scalar* data() const { return m_expression.data(); }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index rowId, Index colId)
    {
      return m_expression.const_cast_derived().coeffRef(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index)
    {
      return m_expression.const_cast_derived().coeffRef(index);
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index rowId, Index colId) const
    {
      return m_expression.coeffRef(rowId, colId);
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index index) const
    {
      return m_expression.coeffRef(index);
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
    {
      OtherDerived& _other = other.const_cast_derived();
@@ -81,6 +93,7 @@ template<typename ExpressionType> class SwapWrapper
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
    {
      OtherDerived& _other = other.const_cast_derived();
@@ -115,6 +128,7 @@ template<typename ExpressionType> class SwapWrapper
      _other.template writePacket<LoadMode>(index, tmp);
    }

+    EIGEN_DEVICE_FUNC
    ExpressionType& expression() const { return m_expression; }

  protected:
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -62,18 +62,21 @@ template<typename MatrixType> class Transpose
    typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)

+    EIGEN_DEVICE_FUNC
    inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)

-    inline Index rows() const { return m_matrix.cols(); }
-    inline Index cols() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }

    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename MatrixType::Nested>::type&
    nestedExpression() const { return m_matrix; }

    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
    typename internal::remove_all<typename MatrixType::Nested>::type&
    nestedExpression() { return m_matrix.const_cast_derived(); }

@@ -106,8 +109,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
    EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)

-    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }

    typedef typename internal::conditional<
                       internal::is_lvalue<MatrixType>::value,
@@ -118,33 +121,39 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
    inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
    inline const Scalar* data() const { return derived().nestedExpression().data(); }

+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
      return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
    }

+    EIGEN_DEVICE_FUNC
    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
      return derived().nestedExpression().const_cast_derived().coeffRef(index);
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return derived().nestedExpression().coeffRef(colId, rowId);
    }

+    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
      return derived().nestedExpression().coeffRef(index);
    }

+    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index rowId, Index colId) const
    {
      return derived().nestedExpression().coeff(colId, rowId);
    }

+    EIGEN_DEVICE_FUNC
    inline CoeffReturnType coeff(Index index) const
    {
      return derived().nestedExpression().coeff(index);
@@ -284,7 +293,8 @@ struct inplace_transpose_selector<MatrixType,false> { // non square matrix
  * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
  * If you just need the transpose of a matrix, use transpose().
  *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix.
+  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
+  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
  *
  * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
@@ -315,6 +325,7 @@ inline void DenseBase<Derived>::transposeInPlace()
  * If you just need the adjoint of a matrix, use adjoint().
  *
  * \note if the matrix is not square, then \c *this must be a resizable matrix.
+  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
  *
  * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -53,7 +53,8 @@ class TranspositionsBase
  public:

    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndexType;
+    typedef typename IndicesType::Index  Index;

    Derived& derived() { return *static_cast<Derived*>(this); }
    const Derived& derived() const { return *static_cast<const Derived*>(this); }
@@ -81,17 +82,17 @@ class TranspositionsBase
    inline Index size() const { return indices().size(); }

    /** Direct access to the underlying index vector */
-    inline const Index& coeff(Index i) const { return indices().coeff(i); }
+    inline const StorageIndexType& coeff(Index i) const { return indices().coeff(i); }
    /** Direct access to the underlying index vector */
-    inline Index& coeffRef(Index i) { return indices().coeffRef(i); }
+    inline StorageIndexType& coeffRef(Index i) { return indices().coeffRef(i); }
    /** Direct access to the underlying index vector */
-    inline const Index& operator()(Index i) const { return indices()(i); }
+    inline const StorageIndexType& operator()(Index i) const { return indices()(i); }
    /** Direct access to the underlying index vector */
-    inline Index& operator()(Index i) { return indices()(i); }
+    inline StorageIndexType& operator()(Index i) { return indices()(i); }
    /** Direct access to the underlying index vector */
-    inline const Index& operator[](Index i) const { return indices()(i); }
+    inline const StorageIndexType& operator[](Index i) const { return indices()(i); }
    /** Direct access to the underlying index vector */
-    inline Index& operator[](Index i) { return indices()(i); }
+    inline StorageIndexType& operator[](Index i) { return indices()(i); }

    /** const version of indices(). */
    const IndicesType& indices() const { return derived().indices(); }
@@ -99,7 +100,7 @@ class TranspositionsBase
    IndicesType& indices() { return derived().indices(); }

    /** Resizes to given size. */
-    inline void resize(int newSize)
+    inline void resize(Index newSize)
    {
      indices().resize(newSize);
    }
@@ -107,7 +108,7 @@ class TranspositionsBase
    /** Sets \c *this to represents an identity transformation */
    void setIdentity()
    {
-      for(int i = 0; i < indices().size(); ++i)
+      for(StorageIndexType i = 0; i < indices().size(); ++i)
        coeffRef(i) = i;
    }

@@ -144,23 +145,26 @@ class TranspositionsBase
 };

 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType>
+struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType> >
 {
-  typedef IndexType Index;
-  typedef Matrix<Index, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef Matrix<_StorageIndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef typename IndicesType::Index Index;
+  typedef _StorageIndexType StorageIndexType;
 };
 }

-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType>
+class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType> >
 {
    typedef internal::traits<Transpositions> Traits;
  public:

    typedef TranspositionsBase<Transpositions> Base;
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndexType;
+    typedef typename IndicesType::Index  Index;
+    

    inline Transpositions() {}

@@ -215,30 +219,32 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim


 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType, int _PacketAccess>
+struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType>,_PacketAccess> >
 {
-  typedef IndexType Index;
-  typedef Map<const Matrix<Index,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef Map<const Matrix<_StorageIndexType,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef typename IndicesType::Index Index;
+  typedef _StorageIndexType StorageIndexType;
 };
 }

-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int PacketAccess>
-class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess>
- : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType, int PacketAccess>
+class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType>,PacketAccess>
+ : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType>,PacketAccess> >
 {
    typedef internal::traits<Map> Traits;
  public:

    typedef TranspositionsBase<Map> Base;
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndexType;
+    typedef typename IndicesType::Index  Index;

-    inline Map(const Index* indicesPtr)
+    inline Map(const StorageIndexType* indicesPtr)
      : m_indices(indicesPtr)
    {}

-    inline Map(const Index* indicesPtr, Index size)
+    inline Map(const StorageIndexType* indicesPtr, Index size)
      : m_indices(indicesPtr,size)
    {}

@@ -275,7 +281,8 @@ namespace internal {
 template<typename _IndicesType>
 struct traits<TranspositionsWrapper<_IndicesType> >
 {
-  typedef typename _IndicesType::Scalar Index;
+  typedef typename _IndicesType::Scalar StorageIndexType;
+  typedef typename _IndicesType::Index Index;
  typedef _IndicesType IndicesType;
 };
 }
@@ -289,7 +296,8 @@ class TranspositionsWrapper

    typedef TranspositionsBase<TranspositionsWrapper> Base;
    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndexType;
+    typedef typename IndicesType::Index  Index;

    inline TranspositionsWrapper(IndicesType& a_indices)
      : m_indices(a_indices)
@@ -363,24 +371,25 @@ struct transposition_matrix_product_retval
 {
    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
    typedef typename TranspositionType::Index Index;
+    typedef typename TranspositionType::StorageIndexType StorageIndexType;

    transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
      : m_transpositions(tr), m_matrix(matrix)
    {}

-    inline int rows() const { return m_matrix.rows(); }
-    inline int cols() const { return m_matrix.cols(); }
+    inline Index rows() const { return m_matrix.rows(); }
+    inline Index cols() const { return m_matrix.cols(); }

    template<typename Dest> inline void evalTo(Dest& dst) const
    {
-      const int size = m_transpositions.size();
-      Index j = 0;
+      const Index size = m_transpositions.size();
+      StorageIndexType j = 0;

      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
        dst = m_matrix;

-      for(int k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
-        if((j=m_transpositions.coeff(k))!=k)
+      for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
+        if(Index(j=m_transpositions.coeff(k))!=k)
        {
          if(Side==OnTheLeft)
            dst.row(k).swap(dst.row(j));
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -44,29 +44,39 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
    typedef typename internal::traits<Derived>::DenseMatrixType DenseMatrixType;
    typedef DenseMatrixType DenseType;

+    EIGEN_DEVICE_FUNC
    inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }

+    EIGEN_DEVICE_FUNC
    inline Index rows() const { return derived().rows(); }
+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return derived().cols(); }
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return derived().outerStride(); }
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return derived().innerStride(); }

+    EIGEN_DEVICE_FUNC
    inline Scalar coeff(Index row, Index col) const  { return derived().coeff(row,col); }
+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row,col); }

    /** \see MatrixBase::copyCoeff(row,col)
      */
    template<typename Other>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other)
    {
      derived().coeffRef(row, col) = other.coeff(row, col);
    }

+    EIGEN_DEVICE_FUNC
    inline Scalar operator()(Index row, Index col) const
    {
      check_coordinates(row, col);
      return coeff(row,col);
    }
+    EIGEN_DEVICE_FUNC
    inline Scalar& operator()(Index row, Index col)
    {
      check_coordinates(row, col);
@@ -74,15 +84,20 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
    }

    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC
    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC
    inline Derived& derived() { return *static_cast<Derived*>(this); }
    #endif // not EIGEN_PARSED_BY_DOXYGEN

    template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
    void evalTo(MatrixBase<DenseDerived> &other) const;
    template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
    void evalToLazy(MatrixBase<DenseDerived> &other) const;

+    EIGEN_DEVICE_FUNC
    DenseMatrixType toDenseMatrix() const
    {
      DenseMatrixType res(rows(), cols());
@@ -189,36 +204,52 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
                    | (Mode & (ZeroDiag))
    };

+    EIGEN_DEVICE_FUNC
    inline TriangularView(const MatrixType& matrix) : m_matrix(matrix)
    {}

+    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC
    inline Index outerStride() const { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC
    inline Index innerStride() const { return m_matrix.innerStride(); }

-    /** \sa MatrixBase::operator+=() */
-    template<typename Other> TriangularView&  operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
+    /** \sa MatrixBase::operator+=() */    
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularView&  operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
    /** \sa MatrixBase::operator-=() */
-    template<typename Other> TriangularView&  operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularView&  operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
    /** \sa MatrixBase::operator*=() */
+    EIGEN_DEVICE_FUNC
    TriangularView&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix * other; }
    /** \sa MatrixBase::operator/=() */
+    EIGEN_DEVICE_FUNC
    TriangularView&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix / other; }

    /** \sa MatrixBase::fill() */
+    EIGEN_DEVICE_FUNC
    void fill(const Scalar& value) { setConstant(value); }
    /** \sa MatrixBase::setConstant() */
+    EIGEN_DEVICE_FUNC
    TriangularView& setConstant(const Scalar& value)
    { return *this = MatrixType::Constant(rows(), cols(), value); }
    /** \sa MatrixBase::setZero() */
+    EIGEN_DEVICE_FUNC
    TriangularView& setZero() { return setConstant(Scalar(0)); }
    /** \sa MatrixBase::setOnes() */
+    EIGEN_DEVICE_FUNC
    TriangularView& setOnes() { return setConstant(Scalar(1)); }

    /** \sa MatrixBase::coeff()
      * \warning the coordinates must fit into the referenced triangular part
      */
+    EIGEN_DEVICE_FUNC
    inline Scalar coeff(Index row, Index col) const
    {
      Base::check_coordinates_internal(row, col);
@@ -228,49 +259,62 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    /** \sa MatrixBase::coeffRef()
      * \warning the coordinates must fit into the referenced triangular part
      */
+    EIGEN_DEVICE_FUNC
    inline Scalar& coeffRef(Index row, Index col)
    {
      Base::check_coordinates_internal(row, col);
      return m_matrix.const_cast_derived().coeffRef(row, col);
    }

+    EIGEN_DEVICE_FUNC
    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+    EIGEN_DEVICE_FUNC
    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }

    /** Assigns a triangular matrix to a triangular part of a dense matrix */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    TriangularView& operator=(const TriangularBase<OtherDerived>& other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    TriangularView& operator=(const MatrixBase<OtherDerived>& other);

+    EIGEN_DEVICE_FUNC
    TriangularView& operator=(const TriangularView& other)
    { return *this = other.nestedExpression(); }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void lazyAssign(const TriangularBase<OtherDerived>& other);

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void lazyAssign(const MatrixBase<OtherDerived>& other);

    /** \sa MatrixBase::conjugate() */
+    EIGEN_DEVICE_FUNC
    inline TriangularView<MatrixConjugateReturnType,Mode> conjugate()
    { return m_matrix.conjugate(); }
    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
    inline const TriangularView<MatrixConjugateReturnType,Mode> conjugate() const
    { return m_matrix.conjugate(); }

    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
    inline const TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> adjoint() const
    { return m_matrix.adjoint(); }

    /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
    inline TriangularView<Transpose<MatrixType>,TransposeMode> transpose()
    {
      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
      return m_matrix.const_cast_derived().transpose();
    }
    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
    inline const TriangularView<Transpose<MatrixType>,TransposeMode> transpose() const
    {
      return m_matrix.transpose();
@@ -278,74 +322,53 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView

    /** Efficient triangular matrix times vector/matrix product */
    template<typename OtherDerived>
-    TriangularProduct<Mode,true,MatrixType,false,OtherDerived, OtherDerived::IsVectorAtCompileTime>
+    EIGEN_DEVICE_FUNC
+    TriangularProduct<Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
    operator*(const MatrixBase<OtherDerived>& rhs) const
    {
      return TriangularProduct
-              <Mode,true,MatrixType,false,OtherDerived,OtherDerived::IsVectorAtCompileTime>
+              <Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
              (m_matrix, rhs.derived());
    }

    /** Efficient vector/matrix times triangular matrix product */
    template<typename OtherDerived> friend
-    TriangularProduct<Mode,false,OtherDerived,OtherDerived::IsVectorAtCompileTime,MatrixType,false>
+    EIGEN_DEVICE_FUNC
+    TriangularProduct<Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularView& rhs)
    {
      return TriangularProduct
-              <Mode,false,OtherDerived,OtherDerived::IsVectorAtCompileTime,MatrixType,false>
+              <Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
              (lhs.derived(),rhs.m_matrix);
    }

-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    struct eigen2_product_return_type
-    {
-      typedef typename TriangularView<MatrixType,Mode>::DenseMatrixType DenseMatrixType;
-      typedef typename OtherDerived::PlainObject::DenseType OtherPlainObject;
-      typedef typename ProductReturnType<DenseMatrixType, OtherPlainObject>::Type ProdRetType;
-      typedef typename ProdRetType::PlainObject type;
-    };
-    template<typename OtherDerived>
-    const typename eigen2_product_return_type<OtherDerived>::type
-    operator*(const EigenBase<OtherDerived>& rhs) const
-    {
-      typename OtherDerived::PlainObject::DenseType rhsPlainObject;
-      rhs.evalTo(rhsPlainObject);
-      return this->toDenseMatrix() * rhsPlainObject;
-    }
-    template<typename OtherMatrixType>
-    bool isApprox(const TriangularView<OtherMatrixType, Mode>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other.toDenseMatrix(), precision);
-    }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other, precision);
-    }
-    #endif // EIGEN2_SUPPORT
-
    template<int Side, typename Other>
+    EIGEN_DEVICE_FUNC
    inline const internal::triangular_solve_retval<Side,TriangularView, Other>
    solve(const MatrixBase<Other>& other) const;

    template<int Side, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void solveInPlace(const MatrixBase<OtherDerived>& other) const;

    template<typename Other>
+    EIGEN_DEVICE_FUNC
    inline const internal::triangular_solve_retval<OnTheLeft,TriangularView, Other> 
    solve(const MatrixBase<Other>& other) const
    { return solve<OnTheLeft>(other); }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void solveInPlace(const MatrixBase<OtherDerived>& other) const
    { return solveInPlace<OnTheLeft>(other); }

+    EIGEN_DEVICE_FUNC
    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
    {
      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
    }
+    EIGEN_DEVICE_FUNC
    SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
    {
      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
@@ -353,18 +376,21 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void swap(TriangularBase<OtherDerived> const & other)
    {
      TriangularView<SwapWrapper<MatrixType>,Mode>(const_cast<MatrixType&>(m_matrix)).lazyAssign(other.derived());
    }

    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    void swap(MatrixBase<OtherDerived> const & other)
    {
      SwapWrapper<MatrixType> swaper(const_cast<MatrixType&>(m_matrix));
      TriangularView<SwapWrapper<MatrixType>,Mode>(swaper).lazyAssign(other.derived());
    }

+    EIGEN_DEVICE_FUNC
    Scalar determinant() const
    {
      if (Mode & UnitDiag)
@@ -377,6 +403,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    
    // TODO simplify the following:
    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE TriangularView& operator=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
    {
      setZero();
@@ -384,12 +411,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    }
    
    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE TriangularView& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
    {
      return assignProduct(other,1);
    }
    
    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE TriangularView& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
    {
      return assignProduct(other,-1);
@@ -397,6 +426,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    
    
    template<typename ProductDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE TriangularView& operator=(const ScaledProduct<ProductDerived>& other)
    {
      setZero();
@@ -404,12 +434,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    }
    
    template<typename ProductDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE TriangularView& operator+=(const ScaledProduct<ProductDerived>& other)
    {
      return assignProduct(other,other.alpha());
    }
    
    template<typename ProductDerived>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE TriangularView& operator-=(const ScaledProduct<ProductDerived>& other)
    {
      return assignProduct(other,-other.alpha());
@@ -418,6 +450,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
  protected:
    
    template<typename ProductDerived, typename Lhs, typename Rhs>
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE TriangularView& assignProduct(const ProductBase<ProductDerived, Lhs,Rhs>& prod, const Scalar& alpha);

    MatrixTypeNested m_matrix;
@@ -439,6 +472,7 @@ struct triangular_assignment_selector
  
  typedef typename Derived1::Scalar Scalar;

+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    triangular_assignment_selector<Derived1, Derived2, Mode, UnrollCount-1, ClearOpposite>::run(dst, src);
@@ -467,6 +501,7 @@ struct triangular_assignment_selector
 template<typename Derived1, typename Derived2, unsigned int Mode, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, Mode, 0, ClearOpposite>
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &, const Derived2 &) {}
 };

@@ -475,6 +510,7 @@ struct triangular_assignment_selector<Derived1, Derived2, Upper, Dynamic, ClearO
 {
  typedef typename Derived1::Index Index;
  typedef typename Derived1::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    for(Index j = 0; j < dst.cols(); ++j)
@@ -493,6 +529,7 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, Lower, Dynamic, ClearOpposite>
 {
  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    for(Index j = 0; j < dst.cols(); ++j)
@@ -512,6 +549,7 @@ struct triangular_assignment_selector<Derived1, Derived2, StrictlyUpper, Dynamic
 {
  typedef typename Derived1::Index Index;
  typedef typename Derived1::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    for(Index j = 0; j < dst.cols(); ++j)
@@ -530,6 +568,7 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, StrictlyLower, Dynamic, ClearOpposite>
 {
  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    for(Index j = 0; j < dst.cols(); ++j)
@@ -548,6 +587,7 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, UnitUpper, Dynamic, ClearOpposite>
 {
  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    for(Index j = 0; j < dst.cols(); ++j)
@@ -568,6 +608,7 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
 struct triangular_assignment_selector<Derived1, Derived2, UnitLower, Dynamic, ClearOpposite>
 {
  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC
  static inline void run(Derived1 &dst, const Derived2 &src)
  {
    for(Index j = 0; j < dst.cols(); ++j)
@@ -710,41 +751,6 @@ void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 * Implementation of MatrixBase methods
 ***************************************************************************/

-#ifdef EIGEN2_SUPPORT
-
-// implementation of part<>(), including the SelfAdjoint case.
-
-namespace internal {
-template<typename MatrixType, unsigned int Mode>
-struct eigen2_part_return_type
-{
-  typedef TriangularView<MatrixType, Mode> type;
-};
-
-template<typename MatrixType>
-struct eigen2_part_return_type<MatrixType, SelfAdjoint>
-{
-  typedef SelfAdjointView<MatrixType, Upper> type;
-};
-}
-
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-const typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part() const
-{
-  return derived();
-}
-
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part()
-{
-  return derived();
-}
-#endif
-
 /**
  * \returns an expression of a triangular view extracted from the current matrix
  *
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h
@@ -72,6 +72,7 @@ template<typename VectorType, int Size> class VectorBlock

    /** Dynamic-size constructor
      */
+    EIGEN_DEVICE_FUNC
    inline VectorBlock(VectorType& vector, Index start, Index size)
      : Base(vector,
             IsColVector ? start : 0, IsColVector ? 0 : start,
@@ -82,6 +83,7 @@ template<typename VectorType, int Size> class VectorBlock

    /** Fixed-size constructor
      */
+    EIGEN_DEVICE_FUNC
    inline VectorBlock(VectorType& vector, Index start)
      : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
    {
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -50,7 +50,7 @@ struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
    MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
    Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
    Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
-    TraversalSize = Direction==Vertical ? RowsAtCompileTime : ColsAtCompileTime
+    TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
  };
  #if EIGEN_GNUC_AT_LEAST(3,4)
  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
@@ -58,7 +58,8 @@ struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
  typedef typename MemberOp::template Cost<InputScalar,TraversalSize> CostOpType;
  #endif
  enum {
-    CoeffReadCost = TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
+    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
+                  : TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
  };
 };
 }
@@ -301,6 +302,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** \returns a row (or column) vector expression of the squared norm
      * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * Example: \include PartialRedux_squaredNorm.cpp
      * Output: \verbinclude PartialRedux_squaredNorm.out
@@ -311,6 +313,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** \returns a row (or column) vector expression of the norm
      * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * Example: \include PartialRedux_norm.cpp
      * Output: \verbinclude PartialRedux_norm.out
@@ -322,7 +325,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** \returns a row (or column) vector expression of the norm
      * of each column (or row) of the referenced expression, using
-      * blue's algorithm.
+      * Blue's algorithm. 
+      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * \sa DenseBase::blueNorm() */
    const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
@@ -332,6 +336,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    /** \returns a row (or column) vector expression of the norm
      * of each column (or row) of the referenced expression, avoiding
      * underflow and overflow.
+      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * \sa DenseBase::stableNorm() */
    const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
@@ -341,6 +346,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    /** \returns a row (or column) vector expression of the norm
      * of each column (or row) of the referenced expression, avoiding
      * underflow and overflow using a concatenation of hypot() calls.
+      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * \sa DenseBase::hypotNorm() */
    const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
@@ -365,6 +371,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** \returns a row (or column) vector expression representing
      * whether \b all coefficients of each respective column (or row) are \c true.
+      * This expression can be assigned to a vector with entries of type \c bool.
      *
      * \sa DenseBase::all() */
    const typename ReturnType<internal::member_all>::Type all() const
@@ -372,6 +379,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** \returns a row (or column) vector expression representing
      * whether \b at \b least one coefficient of each respective column (or row) is \c true.
+      * This expression can be assigned to a vector with entries of type \c bool.
      *
      * \sa DenseBase::any() */
    const typename ReturnType<internal::member_any>::Type any() const
@@ -379,6 +387,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** \returns a row (or column) vector expression representing
      * the number of \c true coefficients of each respective column (or row).
+      * This expression can be assigned to a vector whose entries have the same type as is used to
+      * index entries of the original matrix; for dense matrices, this is \c std::ptrdiff_t .
      *
      * Example: \include PartialRedux_count.cpp
      * Output: \verbinclude PartialRedux_count.out
@@ -550,9 +560,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

 /////////// Geometry module ///////////

-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
    Homogeneous<ExpressionType,Direction> homogeneous() const;
-    #endif

    typedef typename ExpressionType::PlainObject CrossReturnType;
    template<typename OtherDerived>
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -194,7 +194,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  internal::min_coeff_visitor<Derived> minVisitor;
  this->visit(minVisitor);
-  *index = (RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row;
+  *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
  return minVisitor.res;
 }

--- a/Eigen/src/Core/arch/AVX/CMakeLists.txt
+++ b/Eigen/src/Core/arch/AVX/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_Core_arch_AVX_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
+)
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -0,0 +1,463 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX_H
+#define EIGEN_COMPLEX_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet4cf
+{
+  EIGEN_STRONG_INLINE Packet4cf() {}
+  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
+  __m256  v;
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet4cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
+
+template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
+{
+  return Packet4cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
+{
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
+  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
+{
+  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
+{
+  // FIXME The following might be optimized using _mm256_movedup_pd
+  Packet2cf a = ploaddup<Packet2cf>(from);
+  Packet2cf b = ploaddup<Packet2cf>(from+1);
+  return  Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, DenseIndex stride)
+{
+  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
+                                 std::imag(from[2*stride]), std::real(from[2*stride]),
+                                 std::imag(from[1*stride]), std::real(from[1*stride]),
+                                 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, DenseIndex stride)
+{
+  __m128 low = _mm256_extractf128_ps(from.v, 0);
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+
+  __m128 high = _mm256_extractf128_ps(from.v, 1);
+  to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+  to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
+
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet4cf>(const Packet4cf& a)
+{
+  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+  __m128 low  = _mm256_extractf128_ps(a.v, 0);
+  __m128 high = _mm256_extractf128_ps(a.v, 1);
+  __m128d lowd  = _mm_castps_pd(low);
+  __m128d highd = _mm_castps_pd(high);
+  low  = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
+  high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
+  __m256 result = _mm256_setzero_ps();
+  result = _mm256_insertf128_ps(result, low, 1);
+  result = _mm256_insertf128_ps(result, high, 0);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
+{
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
+                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
+{
+  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t0 = _mm256_hadd_ps(t0,t1);
+  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t2 = _mm256_hadd_ps(t2,t3);
+  
+  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
+  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
+
+  return Packet4cf(_mm256_add_ps(t1,t3));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
+{
+  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
+                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> struct conj_helper<Packet8f, Packet4cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
+  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet4cf, Packet8f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
+  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  Packet4cf num = pmul(a, pconj(b));
+  __m256 tmp = _mm256_mul_ps(b.v, b.v);
+  __m256 tmp2    = _mm256_shuffle_ps(tmp,tmp,0xB1);
+  __m256 denom = _mm256_add_ps(tmp, tmp2);
+  return Packet4cf(_mm256_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
+{
+  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet2cd
+{
+  EIGEN_STRONG_INLINE Packet2cd() {}
+  EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
+  __m256d  v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet2cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 2,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
+
+template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
+{
+  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+  return Packet2cd(_mm256_xor_pd(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
+  __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
+  __m256d even = _mm256_mul_pd(tmp1, b.v);
+  __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
+  __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
+  __m256d odd  = _mm256_mul_pd(tmp2, tmp3);
+  return Packet2cd(_mm256_addsub_pd(even, odd));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
+{
+  // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
+//   return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
+    return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, DenseIndex stride)
+{
+  return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
+				 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, DenseIndex stride)
+{
+  __m128d low = _mm256_extractf128_pd(from.v, 0);
+  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
+  __m128d high = _mm256_extractf128_pd(from.v, 1);
+  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
+{
+  __m128d low = _mm256_extractf128_pd(a.v, 0);
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, low);
+  return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
+  return Packet2cd(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
+{
+  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
+{
+  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
+  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
+
+  return Packet2cd(_mm256_add_pd(t0,t1));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
+{
+  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> struct conj_helper<Packet4d, Packet2cd, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
+  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet2cd, Packet4d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
+  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
+  Packet2cd num = pmul(a, pconj(b));
+  __m256d tmp = _mm256_mul_pd(b.v, b.v);
+  __m256d denom = _mm256_hadd_pd(tmp, tmp);
+  return Packet2cd(_mm256_div_pd(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
+{
+  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cf,4>& kernel) {
+  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
+  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
+  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
+  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
+
+  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
+  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
+  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
+  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
+
+  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
+  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
+  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
+  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cd,2>& kernel) {
+  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
+  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
+ kernel.packet[0].v = tmp;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX_H
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -0,0 +1,564 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX_H
+#define EIGEN_PACKET_MATH_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+
+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_FUSED_MADD
+#define EIGEN_HAS_FUSED_MADD 1
+#endif
+#endif
+
+typedef __m256  Packet8f;
+typedef __m256i Packet8i;
+typedef __m256d Packet4d;
+
+template<> struct is_arithmetic<__m256>  { enum { value = true }; };
+template<> struct is_arithmetic<__m256i> { enum { value = true }; };
+template<> struct is_arithmetic<__m256d> { enum { value = true }; };
+
+#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
+  const Packet8f p8f_##NAME = pset1<Packet8f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
+  const Packet4d p4d_##NAME = pset1<Packet4d>(X)
+
+
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet8f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8,
+    HasHalfPacket = 1,
+
+    HasDiv  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 0,
+    HasSqrt = 0
+  };
+ };
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet4d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 1,
+
+    HasDiv  = 1,
+    HasExp  = 0
+  };
+};
+
+/* Proper support for integers is only provided by AVX2. In the meantime, we'll
+   use SSE instructions and packets to deal with integers.
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet8i type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8
+  };
+};
+*/
+
+template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
+template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8}; };
+
+template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
+
+template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
+{
+  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
+{
+  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by AVX");
+  return pset1<Packet8i>(0);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+#if defined(__clang__) || defined(__GNUC__)
+  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+  // and gcc stupidly generates a vfmadd132ps instruction,
+  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
+  // the result of the product.
+  Packet8f res = c;
+  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_ps(a,b,c);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+#if defined(__clang__) || defined(__GNUC__)
+  // see above
+  Packet4d res = c;
+  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_pd(a,b,c);
+#endif
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
+
+// Loads 4 floats from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3, a3}
+template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
+{
+  // TODO try to find a way to avoid the need of a temporary register
+//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
+//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+//   return _mm256_unpacklo_ps(tmp,tmp);
+  
+  // _mm256_insertf128_ps is very slow on Haswell, thus:
+  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  // mimic an "inplace" permutation of the lower 128bits using a blend
+  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+  // then we can perform a consistent permutation on the global register to get everything in shape:
+  return  _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+}
+// Loads 2 doubles from memory a returns the packet {a0, a0  a1, a1}
+template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
+{
+  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
+  return  _mm256_permute_pd(tmp, 3<<2);
+}
+
+// Loads 2 floats from memory a returns the packet {a0, a0  a0, a0, a1, a1, a1, a1}
+template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
+{
+  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
+  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
+// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
+template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, DenseIndex stride)
+{
+  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, DenseIndex stride)
+{
+  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, DenseIndex stride)
+{
+  __m128 low = _mm256_extractf128_ps(from, 0);
+  to[stride*0] = _mm_cvtss_f32(low);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
+
+  __m128 high = _mm256_extractf128_ps(from, 1);
+  to[stride*4] = _mm_cvtss_f32(high);
+  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
+  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
+  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, DenseIndex stride)
+{
+  __m128d low = _mm256_extractf128_pd(from, 0);
+  to[stride*0] = _mm_cvtsd_f64(low);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
+  __m128d high = _mm256_extractf128_pd(from, 1);
+  to[stride*2] = _mm_cvtsd_f64(high);
+  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
+{
+  Packet8f pa = pset1<Packet8f>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
+{
+  Packet4d pa = pset1<Packet4d>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
+{
+  Packet8i pa = pset1<Packet8i>(a);
+  pstore(to, pa);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
+  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
+template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
+}
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet8i>(const Packet8i& a) {
+  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
+{
+  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
+  return _mm256_permute2f128_ps(tmp, tmp, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
+{
+   __m256d tmp = _mm256_shuffle_pd(a,a,5);
+  return _mm256_permute2f128_pd(tmp, tmp, 1);
+
+  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
+    return _mm256_permute_pd(swap_halves,5);
+}
+
+// pabs should be ok
+template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
+{
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm256_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
+{
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm256_and_pd(a,mask);
+}
+
+// preduxp should be ok
+// FIXME: why is this ok? why isn't the simply implementation working as expected?
+template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
+{
+    __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
+    __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
+    __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
+    __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
+
+    __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+    __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+    __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+    __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+    __m256 perm1 =  _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+    __m256 perm2 =  _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+    __m256 perm3 =  _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+    __m256 perm4 =  _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+    __m256 sum1 = _mm256_add_ps(perm1, hsum5);
+    __m256 sum2 = _mm256_add_ps(perm2, hsum6);
+    __m256 sum3 = _mm256_add_ps(perm3, hsum7);
+    __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+
+    __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+    __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+    __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
+    return final;
+}
+template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
+{
+ Packet4d tmp0, tmp1;
+
+  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+  return _mm256_blend_pd(tmp0, tmp1, 0xC);
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
+  tmp0 = _mm256_hadd_ps(tmp0,tmp0);
+  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_hadd_pd(tmp0,tmp0));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
+{
+  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp;
+  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp;
+  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+
+template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+
+
+template<int Offset>
+struct palign_impl<Offset,Packet8f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm256_blend_ps(first, second, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
+    }
+    else if (Offset==2)
+    {
+      first = _mm256_blend_ps(first, second, 3);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
+    }
+    else if (Offset==3)
+    {
+      first = _mm256_blend_ps(first, second, 7);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
+    }
+    else if (Offset==4)
+    {
+      first = _mm256_blend_ps(first, second, 15);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
+      first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
+    }
+    else if (Offset==5)
+    {
+      first = _mm256_blend_ps(first, second, 31);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0x88);
+    }
+    else if (Offset==6)
+    {
+      first = _mm256_blend_ps(first, second, 63);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0xcc);
+    }
+    else if (Offset==7)
+    {
+      first = _mm256_blend_ps(first, second, 127);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0xee);
+    }
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm256_blend_pd(first, second, 1);
+      __m256d tmp = _mm256_permute_pd(first, 5);
+      first = _mm256_permute2f128_pd(tmp, tmp, 1);
+      first = _mm256_blend_pd(tmp, first, 0xA);
+    }
+    else if (Offset==2)
+    {
+      first = _mm256_blend_pd(first, second, 3);
+      first = _mm256_permute2f128_pd(first, first, 1);
+    }
+    else if (Offset==3)
+    {
+      first = _mm256_blend_pd(first, second, 7);
+      __m256d tmp = _mm256_permute_pd(first, 5);
+      first = _mm256_permute2f128_pd(tmp, tmp, 1);
+      first = _mm256_blend_pd(tmp, first, 5);
+    }
+  }
+};
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,8>& kernel) {
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
+  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
+  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
+  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
+  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
+  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
+  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
+  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
+  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
+  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,4>& kernel) {
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
+  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
+  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4d,4>& kernel) {
+  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
+  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
+  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
+  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
+
+  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
+  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
+  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
+  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_AVX_H
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -16,11 +16,14 @@ namespace internal {

 static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
 static Packet16uc p16uc_COMPLEX_RE   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_COMPLEX_IM   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_COMPLEX_IM   = vec_sld(p16uc_DUPLICATE, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_COMPLEX_REV  = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
 static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_COMPLEX_RE, (Packet4ui)p16uc_COMPLEX_IM);//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_COMPLEX_RE, (Packet4ui)p16uc_COMPLEX_IM);//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_COMPLEX_MASK16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);//{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = vec_add(p16uc_PSET_HI, p16uc_COMPLEX_MASK16);//{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = vec_add(p16uc_PSET_LO, p16uc_COMPLEX_MASK16);//{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};

 //---------- float ----------
 struct Packet2cf
@@ -33,6 +36,7 @@ struct Packet2cf
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
  typedef Packet2cf type;
+  typedef Packet2cf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@@ -51,7 +55,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -65,6 +69,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
  return res;
 }

+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, DenseIndex stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return Packet2cf(vec_ld(0, (const float*)af));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, DenseIndex stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  vec_st(from.v, 0, (float*)af);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
@@ -210,6 +230,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x
  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
 }

+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1);
+  kernel.packet[0].v = tmp;
+}
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2008-2014 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,6 +18,10 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
 #endif

+#ifndef EIGEN_HAS_FUSED_MADD
+#define EIGEN_HAS_FUSED_MADD 1
+#endif
+
 #ifndef EIGEN_HAS_FUSE_CJMADD
 #define EIGEN_HAS_FUSE_CJMADD 1
 #endif
@@ -56,29 +60,32 @@ typedef __vector unsigned char  Packet16uc;
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))

 // Define global static constants:
-static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
-static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
-static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
-static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
-static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
+static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet16uc p16uc_REVERSE = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); //{ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}
+static Packet16uc p16uc_DUPLICATE = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};

-static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
-static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}

 template<> struct packet_traits<float>  : default_packet_traits
 {
  typedef Packet4f type;
+  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
+    HasHalfPacket=0,

    // FIXME check the Has*
+    HasDiv  = 1,
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
@@ -89,6 +96,7 @@ template<> struct packet_traits<float>  : default_packet_traits
 template<> struct packet_traits<int>    : default_packet_traits
 {
  typedef Packet4i type;
+  typedef Packet4i half;
  enum {
    // FIXME check the Has*
    Vectorizable = 1,
@@ -97,8 +105,8 @@ template<> struct packet_traits<int>    : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };
 /*
 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
 {
@@ -144,6 +152,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
  return s;
 }
 */
+
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  float EIGEN_ALIGN16 af[4];
@@ -161,6 +170,65 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
  return vc;
 }

+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = vec_ld(0,a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  a3 = vec_ld(0,a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, DenseIndex stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  af[2] = from[2*stride];
+  af[3] = from[3*stride];
+ return vec_ld(0, af);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, DenseIndex stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return vec_ld(0, ai);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, DenseIndex stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  vec_st(from, 0, af);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+  to[2*stride] = af[2];
+  to[3*stride] = af[3];
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, DenseIndex stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  vec_st(from, 0, ai);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }

@@ -286,15 +354,15 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
 {
  Packet4f p;
-  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4f>(from);
-  else                              p = ploadu<Packet4f>(from);
+  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
+  else                             p = ploadu<Packet4f>(from);
  return vec_perm(p, p, p16uc_DUPLICATE);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
  Packet4i p;
-  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4i>(from);
-  else                              p = ploadu<Packet4i>(from);
+  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
+  else                             p = ploadu<Packet4i>(from);
  return vec_perm(p, p, p16uc_DUPLICATE);
 }

@@ -494,6 +562,32 @@ struct palign_impl<Offset,Packet4i>
  }
 };

+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  Packet4f t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  Packet4i t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/arch/CMakeLists.txt
+++ b/Eigen/src/Core/arch/CMakeLists.txt
@@ -1,4 +1,5 @@
 ADD_SUBDIRECTORY(SSE)
 ADD_SUBDIRECTORY(AltiVec)
 ADD_SUBDIRECTORY(NEON)
+ADD_SUBDIRECTORY(AVX)
 ADD_SUBDIRECTORY(Default)
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -28,6 +28,7 @@ struct Packet2cf
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
  typedef Packet2cf type;
+  typedef Packet2cf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@@ -46,7 +47,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -110,7 +111,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }

-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { __pld((float *)addr); }
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, DenseIndex stride)
+{
+  Packet4f res;
+  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
+  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
+  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
+  res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
+  return Packet2cf(res);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, DenseIndex stride)
+{
+  to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
+  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((float *)addr); }

 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@@ -246,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
 }

+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+  float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -48,14 +48,25 @@ typedef uint32x4_t  Packet4ui;
  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
 #endif
-    
-#ifndef __pld
-#define __pld(x) asm volatile ( "   pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
+
+
+// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
+// which available on LLVM and GCC (at least)
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
+  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#elif defined __pld
+  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
+#elif !defined(__aarch64__)
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#else
+  // by default no explicit prefetching
+  #define EIGEN_ARM_PREFETCH(ADDR)
 #endif

 template<> struct packet_traits<float>  : default_packet_traits
 {
  typedef Packet4f type;
+  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@@ -73,6 +84,7 @@ template<> struct packet_traits<float>  : default_packet_traits
 template<> struct packet_traits<int>    : default_packet_traits
 {
  typedef Packet4i type;
+  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@@ -85,12 +97,13 @@ template<> struct packet_traits<int>    : default_packet_traits
 // workaround gcc 4.2, 4.3 and 4.4 compilatin issue
 EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
 EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
 EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
 EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
 #endif

-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };

 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
@@ -209,8 +222,42 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& f
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { __pld(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { __pld(addr); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, DenseIndex stride)
+{
+  Packet4f res;
+  res = vsetq_lane_f32(from[0*stride], res, 0);
+  res = vsetq_lane_f32(from[1*stride], res, 1);
+  res = vsetq_lane_f32(from[2*stride], res, 2);
+  res = vsetq_lane_f32(from[3*stride], res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, DenseIndex stride)
+{
+  Packet4i res;
+  res = vsetq_lane_s32(from[0*stride], res, 0);
+  res = vsetq_lane_s32(from[1*stride], res, 1);
+  res = vsetq_lane_s32(from[2*stride], res, 2);
+  res = vsetq_lane_s32(from[3*stride], res, 3);
+  return res;
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, DenseIndex stride)
+{
+  to[stride*0] = vgetq_lane_f32(from, 0);
+  to[stride*1] = vgetq_lane_f32(from, 1);
+  to[stride*2] = vgetq_lane_f32(from, 2);
+  to[stride*3] = vgetq_lane_f32(from, 3);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, DenseIndex stride)
+{
+  to[stride*0] = vgetq_lane_s32(from, 0);
+  to[stride*1] = vgetq_lane_s32(from, 1);
+  to[stride*2] = vgetq_lane_s32(from, 2);
+  to[stride*3] = vgetq_lane_s32(from, 3);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { EIGEN_ARM_PREFETCH(addr); }

 // FIXME only store the 2 first elements ?
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
@@ -375,6 +422,7 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  max = vpmax_s32(a_lo, a_hi);
+  max = vpmax_s32(max, max);

  return vget_lane_s32(max, 0);
 }
@@ -400,9 +448,30 @@ PALIGN_NEON(0,Packet4i,vextq_s32)
 PALIGN_NEON(1,Packet4i,vextq_s32)
 PALIGN_NEON(2,Packet4i,vextq_s32)
 PALIGN_NEON(3,Packet4i,vextq_s32)
-    
+
 #undef PALIGN_NEON

+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
+  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
+  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
+  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
+  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
+  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
+  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
+  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
+  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
+}
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -22,13 +22,18 @@ struct Packet2cf
  __m128  v;
 };

+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
  typedef Packet2cf type;
+  typedef Packet2cf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
+    HasHalfPacket = 0,

    HasAdd    = 1,
    HasSub    = 1,
@@ -42,8 +47,9 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
    HasSetLinear = 0
  };
 };
+#endif

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo

 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }

-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
+
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, DenseIndex stride)
+{
+  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
+                              std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, DenseIndex stride)
+{
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}

 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

@@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Pack
  #endif
 }

-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }

 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
@@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
 };

 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
@@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
 };

 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -248,13 +269,18 @@ struct Packet1cd
  __m128d  v;
 };

+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
  typedef Packet1cd type;
+  typedef Packet1cd half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 0,
    size = 1,
+    HasHalfPacket = 0,

    HasAdd    = 1,
    HasSub    = 1,
@@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
    HasSetLinear = 0
  };
 };
+#endif

-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };

 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 {
  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
@@ -311,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
 template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }

 // FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }

 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

@@ -410,7 +437,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
 };

 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
@@ -419,7 +446,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
 };

 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
@@ -432,7 +459,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con

 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
 {
-  return Packet1cd(preverse(x.v));
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+  __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+  __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+  __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+  kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+  kernel.packet[1].v = tmp;
 }

 } // end namespace internal
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
  x = _mm_or_ps(x, p4f_half);

  emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
+  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);

  /* part2:
     if( x < SQRTHF ) {
@@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
     } else { x = x - 1.0; }
  */
  Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = _mm_and_ps(x, mask);
+  Packet4f tmp = pand(x, mask);
  x = psub(x, p4f_1);
-  e = psub(e, _mm_and_ps(p4f_1, mask));
+  e = psub(e, pand(p4f_1, mask));
  x = padd(x, tmp);

  Packet4f x2 = pmul(x,x);
@@ -126,7 +126,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);

-  Packet4f tmp = _mm_setzero_ps(), fx;
+  Packet4f tmp, fx;
  Packet4i emm0;

  // clamp x
@@ -138,6 +138,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
 #ifdef EIGEN_VECTORIZE_SSE4_1
  fx = _mm_floor_ps(fx);
 #else
+  tmp = _mm_setzero_ps();
  emm0 = _mm_cvttps_epi32(fx);
  tmp  = _mm_cvtepi32_ps(emm0);
  /* if greater, substract 1 */
@@ -166,7 +167,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, p4i_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
-  return pmul(y, _mm_castsi128_ps(emm0));
+  return pmul(y, Packet4f(_mm_castsi128_ps(emm0)));
 }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
@@ -195,7 +196,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
  static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);

-  Packet2d tmp = _mm_setzero_pd(), fx;
+  Packet2d tmp, fx;
  Packet4i emm0;

  // clamp x
@@ -206,6 +207,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
 #ifdef EIGEN_VECTORIZE_SSE4_1
  fx = _mm_floor_pd(fx);
 #else
+  tmp = _mm_setzero_pd();
  emm0 = _mm_cvttpd_epi32(fx);
  tmp  = _mm_cvtepi32_pd(emm0);
  /* if greater, substract 1 */
@@ -239,7 +241,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
  emm0 = _mm_slli_epi32(emm0, 20);
  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmul(x, _mm_castsi128_pd(emm0));
+  return pmul(x, Packet2d(_mm_castsi128_pd(emm0)));
 }

 /* evaluation of 4 sines at onces, using SSE2 intrinsics.
@@ -279,7 +281,7 @@ Packet4f psin<Packet4f>(const Packet4f& _x)
  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

-  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+  Packet4f xmm1, xmm2, xmm3, sign_bit, y;

  Packet4i emm0, emm2;
  sign_bit = x;
@@ -378,7 +380,7 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

-  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+  Packet4f xmm1, xmm2, xmm3, y;
  Packet4i emm0, emm2;

  x = pabs(x);
@@ -442,8 +444,11 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
  return _mm_xor_ps(y, sign_bit);
 }

+#if EIGEN_FAST_MATH
+
 // This is based on Quake3's fast inverse square root.
 // For detail see here: http://www.beyond3d.com/content/articles/8/
+// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
@@ -457,6 +462,14 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
  return pmul(_x,x);
 }

+#else
+
+template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -22,9 +22,41 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif

+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_FUSED_MADD
+#define EIGEN_HAS_FUSED_MADD 1
+#endif
+#endif
+
+#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER)
+// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
+// have overloads for both types without linking error.
+// One solution is to increase ABI version using -fabi-version=4 (or greater).
+// To workaround this inconvenince, we rather wrap 128bit types into the following helper
+// structure:
+// TODO disable this wrapper if abi-versio>=4, but to detect that without asking the user to define a macro?
+template<typename T>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+  
+  T m_val;
+};
+typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
 typedef __m128  Packet4f;
 typedef __m128i Packet4i;
 typedef __m128d Packet2d;
+#endif

 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
@@ -58,13 +90,18 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)


+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<float>  : default_packet_traits
 {
  typedef Packet4f type;
+  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
+    HasHalfPacket = 0,

    HasDiv  = 1,
    HasSin  = EIGEN_FAST_MATH,
@@ -77,18 +114,23 @@ template<> struct packet_traits<float>  : default_packet_traits
 template<> struct packet_traits<double> : default_packet_traits
 {
  typedef Packet2d type;
+  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=2,
+    HasHalfPacket = 0,

    HasDiv  = 1,
-    HasExp  = 1
+    HasExp  = 1,
+    HasSqrt = 1
  };
 };
+#endif
 template<> struct packet_traits<int>    : default_packet_traits
 {
  typedef Packet4i type;
+  typedef Packet4i half;
  enum {
    // FIXME check the Has*
    Vectorizable = 1,
@@ -97,9 +139,9 @@ template<> struct packet_traits<int>    : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };

 #if defined(_MSC_VER) && (_MSC_VER==1500)
 // Workaround MSVC 9 internal compiler error.
@@ -109,13 +151,26 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
 #endif

+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
+#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
+  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+}
+#endif
+  
+#ifndef EIGEN_VECTORIZE_AVX
 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+#endif
 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }

 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
@@ -138,7 +193,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
 }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
 {
-  return psub(_mm_setr_epi32(0,0,0,0), a);
+  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
 }

 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
@@ -172,6 +227,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co

 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
+#endif

 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
@@ -217,7 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con

 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }

 #if defined(_MSC_VER)
  template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
@@ -235,7 +294,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { E
    #endif
  }
  template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
 #else
 // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
 // require pointer casting to incompatible pointer types and leads to invalid code
@@ -244,14 +303,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { E
 // TODO: do the same for MSVC (ICC is compatible)
 // NOTE: with the code below, MSVC's compiler crashes!

-#if defined(__GNUC__) && defined(__i386__)
+#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8)))
  // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
 #elif defined(__clang__)
  // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
 #else
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
 #endif

 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
@@ -282,7 +344,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
  EIGEN_DEBUG_UNALIGNED_LOAD
 #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 #else
  __m128d res;
  res =  _mm_load_sd((const double*)(from)) ;
@@ -301,38 +363,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
  Packet4i tmp;
-  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
 }

 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }

 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
+#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES
+  _mm_storeu_pd(to, from);
+#else
  _mm_storel_pd((to), from);
  _mm_storeh_pd((to+1), from);
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, DenseIndex stride)
+{
+ return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride)
+{
+ return _mm_set_pd(from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, DenseIndex stride)
+{
+ return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+ }
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, DenseIndex stride)
+{
+  to[stride*0] = _mm_cvtss_f32(from);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride)
+{
+  to[stride*0] = _mm_cvtsd_f64(from);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, DenseIndex stride)
+{
+  to[stride*0] = _mm_cvtsi128_si32(from);
+  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }

 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
 {
  Packet4f pa = _mm_set_ss(a);
-  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
+  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
 }
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
 {
  Packet2d pa = _mm_set_sd(a);
-  pstore(to, vec2d_swizzle1(pa,0,0));
+  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }

+#ifndef EIGEN_VECTORIZE_AVX
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif

 #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
@@ -379,6 +480,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
  #endif
 }

+// with AVX, the default implementations based on pload1 are faster
+#ifndef __AVX__
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec4f_swizzle1(a3, 0,0,0,0);
+  a1 = vec4f_swizzle1(a3, 1,1,1,1);
+  a2 = vec4f_swizzle1(a3, 2,2,2,2);
+  a3 = vec4f_swizzle1(a3, 3,3,3,3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  a0 = _mm_loaddup_pd(a+0);
+  a1 = _mm_loaddup_pd(a+1);
+  a2 = _mm_loaddup_pd(a+2);
+  a3 = _mm_loaddup_pd(a+3);
+#else
+  a1 = pload<Packet2d>(a);
+  a0 = vec2d_swizzle1(a1, 0,0);
+  a1 = vec2d_swizzle1(a1, 1,1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec2d_swizzle1(a3, 0,0);
+  a3 = vec2d_swizzle1(a3, 1,1);
+#endif
+}
+#endif
+
 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
 {
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
@@ -406,10 +539,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
  Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst(_mm_hadd_ps(tmp0, tmp0));
+  return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
 }

-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }

 // SSSE3 version:
 // EIGEN_STRONG_INLINE float predux(const Packet4i& a)
@@ -452,7 +585,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
-  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }

 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -475,11 +608,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 {
@@ -495,42 +628,52 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
  // after some experiments, it is seems this is the fastest way to implement it
  // for GCC (eg., it does not like using std::min after the pstore !!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
-  register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
-  register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
 }

 // max
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
  // after some experiments, it is seems this is the fastest way to implement it
  // for GCC (eg., it does not like using std::min after the pstore !!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
-  register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
-  register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
 }

 #if (defined __GNUC__)
@@ -641,6 +784,31 @@ struct palign_impl<Offset,Packet2d>
 };
 #endif

+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -0,0 +1,167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGNMENT_FUNCTORS_H
+#define EIGEN_ASSIGNMENT_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+  
+/** \internal
+  * \brief Template functor for scalar/packet assignment
+  *
+  */
+template<typename Scalar> struct assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a = b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,b); }
+};
+template<typename Scalar>
+struct functor_traits<assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost,
+    PacketAccess = packet_traits<Scalar>::IsVectorized
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with addition
+  *
+  */
+template<typename Scalar> struct add_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a += b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename Scalar>
+struct functor_traits<add_assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAdd
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with subtraction
+  *
+  */
+template<typename Scalar> struct sub_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a -= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename Scalar>
+struct functor_traits<sub_assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAdd
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with multiplication
+  *
+  */
+template<typename Scalar> struct mul_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a *= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename Scalar>
+struct functor_traits<mul_assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasMul
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with diviving
+  *
+  */
+template<typename Scalar> struct div_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a /= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename Scalar>
+struct functor_traits<div_assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasMul
+  };
+};
+
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with swaping
+  *
+  * It works as follow. For a non-vectorized evaluation loop, we have:
+  *   for(i) func(A.coeffRef(i), B.coeff(i));
+  * where B is a SwapWrapper expression. The trick is to make SwapWrapper::coeff behaves like a non-const coeffRef.
+  * Actually, SwapWrapper might not even be needed since even if B is a plain expression, since it has to be writable
+  * B.coeff already returns a const reference to the underlying scalar value.
+  * 
+  * The case of a vectorized loop is more tricky:
+  *   for(i,j) func.assignPacket<A_Align>(&A.coeffRef(i,j), B.packet<B_Align>(i,j));
+  * Here, B must be a SwapWrapper whose packet function actually returns a proxy object holding a Scalar*,
+  * the actual alignment and Packet type.
+  *
+  */
+template<typename Scalar> struct swap_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
+  {
+    using std::swap;
+    swap(a,const_cast<Scalar&>(b));
+  }
+  
+  template<int LhsAlignment, int RhsAlignment, typename Packet>
+  EIGEN_STRONG_INLINE void swapPacket(Scalar* a, Scalar* b) const
+  {
+    Packet tmp = internal::ploadt<Packet,RhsAlignment>(b);
+    internal::pstoret<Scalar,Packet,RhsAlignment>(b, internal::ploadt<Packet,LhsAlignment>(a));
+    internal::pstoret<Scalar,Packet,LhsAlignment>(a, tmp);
+  }
+};
+template<typename Scalar>
+struct functor_traits<swap_assign_op<Scalar> > {
+  enum {
+    Cost = 3 * NumTraits<Scalar>::ReadCost,
+    PacketAccess = packet_traits<Scalar>::IsVectorized
+  };
+};
+
+} // namespace internal
+
+} // namespace Eigen
+
+#endif // EIGEN_ASSIGNMENT_FUNCTORS_H
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -0,0 +1,456 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BINARY_FUNCTORS_H
+#define EIGEN_BINARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative binary functors ----------
+
+/** \internal
+  * \brief Template functor to compute the sum of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
+  */
+template<typename Scalar> struct scalar_sum_op {
+//   typedef Scalar result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::padd(a,b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sum_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAdd
+  };
+};
+
+/** \internal
+  * \brief Template specialization to deprecate the summation of boolean expressions.
+  * This is required to solve Bug 426.
+  * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
+  */
+template<> struct scalar_sum_op<bool> : scalar_sum_op<int> {
+  EIGEN_DEPRECATED
+  scalar_sum_op() {}
+};
+
+
+/** \internal
+  * \brief Template functor to compute the product of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
+  enum {
+    // TODO vectorize mixed product
+    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
+  };
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmul(a,b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  { return internal::predux_mul(a); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
+    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the conjugate product of two scalars
+  *
+  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
+
+  enum {
+    Conj = NumTraits<LhsScalar>::IsComplex
+  };
+  
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
+  
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = NumTraits<LhsScalar>::MulCost,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the min of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
+  */
+template<typename Scalar> struct scalar_min_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { EIGEN_USING_STD_MATH(min); return (min)(a, b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmin(a,b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux_min(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_min_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasMin
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the max of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
+  */
+template<typename Scalar> struct scalar_max_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { EIGEN_USING_STD_MATH(max); return (max)(a, b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmax(a,b); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux_max(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_max_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasMax
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the hypot of two scalars
+  *
+  * \sa MatrixBase::stableNorm(), class Redux
+  */
+template<typename Scalar> struct scalar_hypot_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
+//   typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+  {
+    EIGEN_USING_STD_MATH(max);
+    EIGEN_USING_STD_MATH(min);
+    using std::sqrt;
+    Scalar p, qp;
+    if(_x>_y)
+    {
+      p = _x;
+      qp = _y / p;
+    }
+    else
+    {
+      p = _y;
+      qp = _x / p;
+    }
+    return p * sqrt(Scalar(1) + qp*qp);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_hypot_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
+};
+
+/** \internal
+  * \brief Template functor to compute the pow of two scalars
+  */
+template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
+  EIGEN_DEVICE_FUNC
+  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
+};
+template<typename Scalar, typename OtherScalar>
+struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
+
+
+//---------- non associative binary functors ----------
+
+/** \internal
+  * \brief Template functor to compute the difference of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator-
+  */
+template<typename Scalar> struct scalar_difference_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::psub(a,b); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_difference_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasSub
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the quotient of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator/()
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
+  enum {
+    // TODO vectorize mixed product
+    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
+  };
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pdiv(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
+    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
+  };
+};
+
+
+
+/** \internal
+  * \brief Template functor to compute the and of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator&&
+  */
+struct scalar_boolean_and_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
+};
+template<> struct functor_traits<scalar_boolean_and_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the or of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator||
+  */
+struct scalar_boolean_or_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
+};
+template<> struct functor_traits<scalar_boolean_or_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+
+
+//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
+
+/** \internal
+  * \brief Template functor to multiply a scalar by a fixed other one
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
+  */
+/* NOTE why doing the pset1() in packetOp *is* an optimization ?
+ * indeed it seems better to declare m_other as a Packet and do the pset1() once
+ * in the constructor. However, in practice:
+ *  - GCC does not like m_other as a Packet and generate a load every time it needs it
+ *  - on the other hand GCC is able to moves the pset1() outside the loop :)
+ *  - simpler code ;)
+ * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
+ */
+template<typename Scalar>
+struct scalar_multiple_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a, pset1<Packet>(m_other)); }
+  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_multiple_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+template<typename Scalar1, typename Scalar2>
+struct scalar_multiple2_op {
+  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
+  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
+};
+template<typename Scalar1,typename Scalar2>
+struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
+{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to divide a scalar by a fixed other one
+  *
+  * This functor is used to implement the quotient of a matrix by
+  * a scalar where the scalar type is not necessarily a floating point type.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator/
+  */
+template<typename Scalar>
+struct scalar_quotient1_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(a, pset1<Packet>(m_other)); }
+  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_quotient1_op<Scalar> >
+{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+
+// In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
+// where the mixing of different types is handled by scalar_product_traits
+// In particular, real * complex<real> is allowed.
+// FIXME move this to functor_traits adding a functor_default
+template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+
+
+/** \internal
+  * \brief Template functor to add a scalar to a fixed other one
+  * \sa class CwiseUnaryOp, Array::operator+
+  */
+/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
+template<typename Scalar>
+struct scalar_add_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_DEVICE_FUNC inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC inline scalar_add_op(const Scalar& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a + m_other; }
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::padd(a, pset1<Packet>(m_other)); }
+  const Scalar m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_add_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
+
+/** \internal
+  * \brief Template functor to subtract a fixed scalar to another one
+  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_rsub_op
+  */
+template<typename Scalar>
+struct scalar_sub_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { }
+  inline scalar_sub_op(const Scalar& other) : m_other(other) { }
+  inline Scalar operator() (const Scalar& a) const { return a - m_other; }
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::psub(a, pset1<Packet>(m_other)); }
+  const Scalar m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_sub_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
+
+/** \internal
+  * \brief Template functor to subtract a scalar to fixed another one
+  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_sub_op
+  */
+template<typename Scalar>
+struct scalar_rsub_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { }
+  inline scalar_rsub_op(const Scalar& other) : m_other(other) { }
+  inline Scalar operator() (const Scalar& a) const { return m_other - a; }
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::psub(pset1<Packet>(m_other), a); }
+  const Scalar m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_rsub_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
+
+/** \internal
+  * \brief Template functor to raise a scalar to a power
+  * \sa class CwiseUnaryOp, Cwise::pow
+  */
+template<typename Scalar>
+struct scalar_pow_op {
+  // FIXME default copy constructors seems bugged with std::complex<>
+  inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
+  inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
+  EIGEN_DEVICE_FUNC
+  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
+  const Scalar m_exponent;
+};
+template<typename Scalar>
+struct functor_traits<scalar_pow_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to compute the quotient between a scalar and array entries.
+  * \sa class CwiseUnaryOp, Cwise::inverse()
+  */
+template<typename Scalar>
+struct scalar_inverse_mult_op {
+  scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other / a; }
+  template<typename Packet>
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(pset1<Packet>(m_other),a); }
+  Scalar m_other;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BINARY_FUNCTORS_H
--- a/Eigen/src/Core/functors/CMakeLists.txt
+++ b/Eigen/src/Core/functors/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_Functor_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_Core_Functor_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/functors COMPONENT Devel
+  )
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NULLARY_FUNCTORS_H
+#define EIGEN_NULLARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Scalar>
+struct scalar_constant_op {
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
+  const Scalar m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_constant_op<Scalar> >
+// FIXME replace this packet test by a safe one
+{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
+
+template<typename Scalar> struct scalar_identity_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_identity_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
+
+template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
+
+// linear access for packet ops:
+// 1) initialization
+//   base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
+// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
+//   base += [size*step, ..., size*step]
+//
+// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
+//       in order to avoid the padd() in operator() ?
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,false>
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  linspaced_op_impl(const Scalar& low, const Scalar& step) :
+  m_low(low), m_step(step),
+  m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
+  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
+
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
+  { 
+    m_base = padd(m_base, pset1<Packet>(m_step));
+    return m_low+Scalar(i)*m_step; 
+  }
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
+
+  const Scalar m_low;
+  const Scalar m_step;
+  const Packet m_packetStep;
+  mutable Packet m_base;
+};
+
+// random access for packet ops:
+// 1) each step
+//   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,true>
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  linspaced_op_impl(const Scalar& low, const Scalar& step) :
+  m_low(low), m_step(step),
+  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
+
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
+  { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(i)),m_interPacket))); }
+
+  const Scalar m_low;
+  const Scalar m_step;
+  const Packet m_lowPacket;
+  const Packet m_stepPacket;
+  const Packet m_interPacket;
+};
+
+// ----- Linspace functor ----------------------------------------------------------------
+
+// Forward declaration (we default to random access which does not really give
+// us a speed gain when using packet access but it allows to use the functor in
+// nested expressions).
+template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
+template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
+{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
+template <typename Scalar, bool RandomAccess> struct linspaced_op
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  linspaced_op(const Scalar& low, const Scalar& high, DenseIndex num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {}
+
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
+
+  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
+  // there row==0 and col is used for the actual iteration.
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
+  {
+    eigen_assert(col==0 || row==0);
+    return impl(col + row);
+  }
+
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
+
+  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
+  // there row==0 and col is used for the actual iteration.
+  template<typename Index>
+  EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
+  {
+    eigen_assert(col==0 || row==0);
+    return impl.packetOp(col + row);
+  }
+
+  // This proxy object handles the actual required temporaries, the different
+  // implementations (random vs. sequential access) as well as the
+  // correct piping to size 2/4 packet operations.
+  const linspaced_op_impl<Scalar,RandomAccess> impl;
+};
+
+// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
+// to indicate whether a functor allows linear access, just always answering 'yes' except for
+// scalar_identity_op.
+// FIXME move this to functor_traits adding a functor_default
+template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
+template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_NULLARY_FUNCTORS_H
--- a/Eigen/src/Core/functors/StlFunctors.h
+++ b/Eigen/src/Core/functors/StlFunctors.h
@@ -0,0 +1,129 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STL_FUNCTORS_H
+#define EIGEN_STL_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// default functor traits for STL functors:
+
+template<typename T>
+struct functor_traits<std::multiplies<T> >
+{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::divides<T> >
+{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::plus<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::minus<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::negate<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_or<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_and<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_not<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::greater<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::less<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::greater_equal<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::less_equal<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::equal_to<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::not_equal_to<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binder2nd<T> >
+{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binder1st<T> >
+{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::unary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+#ifdef EIGEN_STDEXT_SUPPORT
+
+template<typename T0,typename T1>
+struct functor_traits<std::project1st<T0,T1> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::project2nd<T0,T1> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::select2nd<std::pair<T0,T1> > >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::select1st<std::pair<T0,T1> > >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::unary_compose<T0,T1> >
+{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
+
+template<typename T0,typename T1,typename T2>
+struct functor_traits<std::binary_compose<T0,T1,T2> >
+{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
+
+#endif // EIGEN_STDEXT_SUPPORT
+
+// allow to add new functors and specializations of functor_traits from outside Eigen.
+// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
+#ifdef EIGEN_FUNCTORS_PLUGIN
+#include EIGEN_FUNCTORS_PLUGIN
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_STL_FUNCTORS_H
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -0,0 +1,396 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_UNARY_FUNCTORS_H
+#define EIGEN_UNARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \brief Template functor to compute the opposite of a scalar
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator-
+  */
+template<typename Scalar> struct scalar_opposite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pnegate(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_opposite_op<Scalar> >
+{ enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNegate };
+};
+
+/** \internal
+  * \brief Template functor to compute the absolute value of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs
+  */
+template<typename Scalar> struct scalar_abs_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pabs(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_abs_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAbs
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the squared absolute value of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs2
+  */
+template<typename Scalar> struct scalar_abs2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_abs2_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
+
+/** \internal
+  * \brief Template functor to compute the conjugate of a complex value
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
+  */
+template<typename Scalar> struct scalar_conjugate_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_conjugate_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
+    PacketAccess = packet_traits<Scalar>::HasConj
+  };
+};
+
+/** \internal
+  * \brief Template functor to cast a scalar to another type
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::cast()
+  */
+template<typename Scalar, typename NewType>
+struct scalar_cast_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef NewType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
+};
+template<typename Scalar, typename NewType>
+struct functor_traits<scalar_cast_op<Scalar,NewType> >
+{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the real part of a complex
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_real_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the imaginary part of a complex
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_imag_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the real part of a complex as a reference
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_real_ref_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the imaginary part of a complex as a reference
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_imag_ref_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  *
+  * \brief Template functor to compute the exponential of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::exp()
+  */
+template<typename Scalar> struct scalar_exp_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_exp_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
+
+/** \internal
+  *
+  * \brief Template functor to compute the logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log()
+  */
+template<typename Scalar> struct scalar_log_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
+
+
+/** \internal
+  * \brief Template functor to compute the square root of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sqrt()
+  */
+template<typename Scalar> struct scalar_sqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sqrt_op<Scalar> >
+{ enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSqrt
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the cosine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::cos()
+  */
+template<typename Scalar> struct scalar_cos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cos_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCos
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the sine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sin()
+  */
+template<typename Scalar> struct scalar_sin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sin_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSin
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the tan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::tan()
+  */
+template<typename Scalar> struct scalar_tan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_tan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasTan
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the arc cosine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::acos()
+  */
+template<typename Scalar> struct scalar_acos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_acos_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasACos
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the arc sine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::asin()
+  */
+template<typename Scalar> struct scalar_asin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_asin_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasASin
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the atan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atan()
+  */
+template<typename Scalar> struct scalar_atan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_atan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATan
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the inverse of a scalar
+  * \sa class CwiseUnaryOp, Cwise::inverse()
+  */
+template<typename Scalar>
+struct scalar_inverse_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
+  template<typename Packet>
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+
+/** \internal
+  * \brief Template functor to compute the square of a scalar
+  * \sa class CwiseUnaryOp, Cwise::square()
+  */
+template<typename Scalar>
+struct scalar_square_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; }
+  template<typename Packet>
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_square_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+/** \internal
+  * \brief Template functor to compute the cube of a scalar
+  * \sa class CwiseUnaryOp, Cwise::cube()
+  */
+template<typename Scalar>
+struct scalar_cube_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; }
+  template<typename Packet>
+  inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,pmul(a,a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cube_op<Scalar> >
+{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_FUNCTORS_H
--- a/Eigen/src/Core/products/CoeffBasedProduct.h
+++ b/Eigen/src/Core/products/CoeffBasedProduct.h
@@ -85,7 +85,8 @@ struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
      Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
            | (EvalToRowMajor ? RowMajorBit : 0)
            | NestingFlags
-            | (LhsFlags & RhsFlags & AlignedBit)
+            | (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
+            | (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
            // TODO enable vectorization for mixed types
            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),

@@ -140,11 +141,13 @@ class CoeffBasedProduct

  public:

+    EIGEN_DEVICE_FUNC
    inline CoeffBasedProduct(const CoeffBasedProduct& other)
      : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
    {}

    template<typename Lhs, typename Rhs>
+    EIGEN_DEVICE_FUNC 
    inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
      : m_lhs(lhs), m_rhs(rhs)
    {
@@ -157,9 +160,10 @@ class CoeffBasedProduct
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }

-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }

+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
    {
      Scalar res;
@@ -170,6 +174,7 @@ class CoeffBasedProduct
    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
     * which is why we don't set the LinearAccessBit.
     */
+    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
    {
      Scalar res;
@@ -191,22 +196,26 @@ class CoeffBasedProduct
    }

    // Implicit conversion to the nested type (trigger the evaluation of the product)
+    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE operator const PlainObject& () const
    {
      m_result.lazyAssign(*this);
      return m_result;
    }

-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
+    EIGEN_DEVICE_FUNC const _LhsNested& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC const _RhsNested& rhs() const { return m_rhs; }

+    EIGEN_DEVICE_FUNC
    const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }

    template<int DiagonalIndex>
+    EIGEN_DEVICE_FUNC 
    const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }

+    EIGEN_DEVICE_FUNC
    const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }

@@ -239,6 +248,7 @@ template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
 {
  typedef typename Lhs::Index Index;
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
@@ -250,6 +260,7 @@ template<typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
 {
  typedef typename Lhs::Index Index;
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
  {
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
@@ -260,6 +271,7 @@ template<typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
 {
  typedef typename Lhs::Index Index;
+  EIGEN_DEVICE_FUNC 
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
  {
    eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
@@ -305,7 +317,6 @@ struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, Re
  {
    Packet pres;
    product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
-    product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
    res = predux(pres);
  }
 };
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -23,6 +23,8 @@ template<
  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
 struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
 {
+  typedef gebp_traits<RhsScalar,LhsScalar> Traits;
+  
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static EIGEN_STRONG_INLINE void run(
    Index rows, Index cols, Index depth,
@@ -51,6 +53,8 @@ template<
 struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
 {

+typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  
 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 static void run(Index rows, Index cols, Index depth,
  const LhsScalar* _lhs, Index lhsStride,
@@ -63,11 +67,9 @@ static void run(Index rows, Index cols, Index depth,
  const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
  const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);

-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-
  Index kc = blocking.kc();                   // cache block size along the K direction
  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
-  //Index nc = blocking.nc(); // cache block size along the N direction
+  Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction

  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@@ -80,68 +82,68 @@ static void run(Index rows, Index cols, Index depth,
    Index tid = omp_get_thread_num();
    Index threads = omp_get_num_threads();
    
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
+    LhsScalar* blockA = blocking.blockA();
+    eigen_internal_assert(blockA!=0);
    
-    RhsScalar* blockB = blocking.blockB();
-    eigen_internal_assert(blockB!=0);
-
+    std::size_t sizeB = kc*nc;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
+      
    // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
    for(Index k=0; k<depth; k+=kc)
    {
      const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'

      // In order to reduce the chance that a thread has to wait for the other,
-      // let's start by packing A'.
-      pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
+      // let's start by packing B'.
+      pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);

-      // Pack B_k to B' in a parallel fashion:
-      // each thread packs the sub block B_k,j to B'_j where j is the thread id.
+      // Pack A_k to A' in a parallel fashion:
+      // each thread packs the sub block A_k,i to A'_i where i is the thread id.

-      // However, before copying to B'_j, we have to make sure that no other thread is still using it,
+      // However, before copying to A'_i, we have to make sure that no other thread is still using it,
      // i.e., we test that info[tid].users equals 0.
      // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
      while(info[tid].users!=0) {}
      info[tid].users += threads;
+      
+      pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);

-      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
-
-      // Notify the other threads that the part B'_j is ready to go.
+      // Notify the other threads that the part A'_i is ready to go.
      info[tid].sync = k;
-
-      // Computes C_i += A' * B' per B'_j
+      
+      // Computes C_i += A' * B' per A'_i
      for(Index shift=0; shift<threads; ++shift)
      {
-        Index j = (tid+shift)%threads;
+        Index i = (tid+shift)%threads;

-        // At this point we have to make sure that B'_j has been updated by the thread j,
+        // At this point we have to make sure that A'_i has been updated by the thread i,
        // we use testAndSetOrdered to mimic a volatile access.
        // However, no need to wait for the B' part which has been updated by the current thread!
        if(shift>0)
-          while(info[j].sync!=k) {}
-
-        gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
+          while(info[i].sync!=k) {}
+        gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
      }

-      // Then keep going as usual with the remaining A'
-      for(Index i=mc; i<rows; i+=mc)
+      // Then keep going as usual with the remaining B'
+      for(Index j=nc; j<cols; j+=nc)
      {
-        const Index actual_mc = (std::min)(i+mc,rows)-i;
+        const Index actual_nc = (std::min)(j+nc,cols)-j;

-        // pack A_i,k to A'
-        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+        // pack B_k,j to B'
+        pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);

-        // C_i += A' * B'
-        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
+        // C_j += A' * B'
+        gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
      }

-      // Release all the sub blocks B'_j of B' for the current thread,
+      // Release all the sub blocks A'_i of A' for the current thread,
      // i.e., we simply decrement the number of users by 1
-      for(Index j=0; j<threads; ++j)
+      #pragma omp critical
+      {
+      for(Index i=0; i<threads; ++i)
        #pragma omp atomic
-        --(info[j].users);
+        --(info[i].users);
+      }
    }
  }
  else
@@ -151,38 +153,34 @@ static void run(Index rows, Index cols, Index depth,

    // this is the sequential version!
    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*nc;

    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());

    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    // (==GEMM_VAR1)
    for(Index k2=0; k2<depth; k2+=kc)
    {
      const Index actual_kc = (std::min)(k2+kc,depth)-k2;

      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-      // Note that this panel will be read as many times as the number of blocks in the lhs's
-      // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+      // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+      // Note that this panel will be read as many times as the number of blocks in the rhs's
+      // horizontal panel which is, in practice, a very low number.
+      pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);

-      // For each mc x kc block of the lhs's vertical panel...
-      // (==GEPP_VAR1)
-      for(Index i2=0; i2<rows; i2+=mc)
+      // For each kc x nc block of the rhs's horizontal panel...
+      for(Index j2=0; j2<cols; j2+=nc)
      {
-        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+        const Index actual_nc = (std::min)(j2+nc,cols)-j2;

-        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
+        // We pack the rhs's block into a sequential chunk of memory (L2 caching)
        // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+        // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+        pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);

-        // Everything is packed, we can now call the block * panel kernel:
-        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        // Everything is packed, we can now call the panel * block kernel:
+        gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
      }
    }
  }
@@ -203,14 +201,13 @@ struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
 template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
 struct gemm_functor
 {
-  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
-                  BlockingType& blocking)
+  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
    : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
  {}

  void initParallelSession() const
  {
-    m_blocking.allocateB();
+    m_blocking.allocateA();
  }

  void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
@@ -224,6 +221,8 @@ struct gemm_functor
              (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
              m_actualAlpha, m_blocking, info);
  }
+  
+  typedef typename Gemm::Traits Traits;

  protected:
    const Lhs& m_lhs;
@@ -245,7 +244,6 @@ class level3_blocking
  protected:
    LhsScalar* m_blockA;
    RhsScalar* m_blockB;
-    RhsScalar* m_blockW;

    DenseIndex m_mc;
    DenseIndex m_nc;
@@ -254,7 +252,7 @@ class level3_blocking
  public:

    level3_blocking()
-      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
+      : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
    {}

    inline DenseIndex mc() const { return m_mc; }
@@ -263,7 +261,6 @@ class level3_blocking

    inline LhsScalar* blockA() { return m_blockA; }
    inline RhsScalar* blockB() { return m_blockB; }
-    inline RhsScalar* blockW() { return m_blockW; }
 };

 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
@@ -282,29 +279,25 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
    enum {
      SizeA = ActualRows * MaxDepth,
-      SizeB = ActualCols * MaxDepth,
-      SizeW = MaxDepth * Traits::WorkSpaceFactor
+      SizeB = ActualCols * MaxDepth
    };

-    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
-    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
+    EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];

  public:

-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
+    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
    {
      this->m_mc = ActualRows;
      this->m_nc = ActualCols;
      this->m_kc = MaxDepth;
      this->m_blockA = m_staticA;
      this->m_blockB = m_staticB;
-      this->m_blockW = m_staticW;
    }

    inline void allocateA() {}
    inline void allocateB() {}
-    inline void allocateW() {}
    inline void allocateAll() {}
 };

@@ -323,20 +316,28 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M

    DenseIndex m_sizeA;
    DenseIndex m_sizeB;
-    DenseIndex m_sizeW;

  public:

-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
+    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
    {
      this->m_mc = Transpose ? cols : rows;
      this->m_nc = Transpose ? rows : cols;
      this->m_kc = depth;

-      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
+      if(full_rows)
+      {
+        DenseIndex m = this->m_mc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
+      }
+      else // full columns
+      {
+        DenseIndex n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
+      }
+      
      m_sizeA = this->m_mc * this->m_kc;
      m_sizeB = this->m_kc * this->m_nc;
-      m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
    }

    void allocateA()
@@ -351,24 +352,16 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
        this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
    }

-    void allocateW()
-    {
-      if(this->m_blockW==0)
-        this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
-    }
-
    void allocateAll()
    {
      allocateA();
      allocateB();
-      allocateW();
    }

    ~gemm_blocking_space()
    {
      aligned_delete(this->m_blockA, m_sizeA);
      aligned_delete(this->m_blockB, m_sizeB);
-      aligned_delete(this->m_blockW, m_sizeW);
    }
 };

@@ -393,7 +386,37 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
      typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
      EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
    }
+    
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() = m_lhs .lazyProduct( m_rhs );
+      else
+      {
+        dst.setZero();
+        scaleAndAddTo(dst,Scalar(1));
+      }
+    }

+    template<typename Dest>
+    inline void addTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() += m_lhs .lazyProduct( m_rhs );
+      else
+        scaleAndAddTo(dst,Scalar(1));
+    }
+
+    template<typename Dest>
+    inline void subTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() -= m_lhs .lazyProduct( m_rhs );
+      else
+        scaleAndAddTo(dst,Scalar(-1));
+    }
+    
    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
    {
      eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
@@ -416,7 +439,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
          (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
        _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;

-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
+      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);

      internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
    }
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
    if(mc > Traits::nr)
      mc = (mc/Traits::nr)*Traits::nr;

-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*size;
    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
-    RhsScalar* blockB = allocatedBlockB + sizeW;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
    
    gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
    gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
        //  3 - after the diagonal => processed with gebp or skipped
        if (UpLo==Lower)
          gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+               -1, -1, 0, 0);

-        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
+        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);

        if (UpLo==Upper)
        {
          Index j2 = i2+actual_mc;
          gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+               -1, -1, 0, 0);
        }
      }
    }
@@ -136,7 +133,7 @@ struct tribb_kernel
  enum {
    BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
  };
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
+  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
  {
    gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
@@ -150,7 +147,7 @@ struct tribb_kernel

      if(UpLo==Upper)
        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);

      // selfadjoint micro block
      {
@@ -158,7 +155,7 @@ struct tribb_kernel
        buffer.setZero();
        // 1 - apply the kernel on the temporary buffer
        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);
        // 2 - triangular accumulation
        for(Index j1=0; j1<actualBlockSize; ++j1)
        {
@@ -173,7 +170,7 @@ struct tribb_kernel
      {
        Index i = j+actualBlockSize;
        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);
      }
    }
  }
@@ -268,6 +265,8 @@ template<typename MatrixType, unsigned int UpLo>
 template<typename ProductDerived, typename _Lhs, typename _Rhs>
 TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
 {
+  eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
+
  general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
  
  return *this;
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
@@ -53,6 +53,8 @@ template< \
  int RhsStorageOrder, bool ConjugateRhs> \
 struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
 { \
+typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
+\
 static void run(Index rows, Index cols, Index depth, \
  const EIGTYPE* _lhs, Index lhsStride, \
  const EIGTYPE* _rhs, Index rhsStride, \
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -26,6 +26,34 @@ namespace internal {
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * Accesses to the matrix coefficients follow the following logic:
+ *
+ * - if all columns have the same alignment then
+ *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
+ *   - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise
+ *   - if even columns have the same alignment then
+ *     // odd columns are guaranteed to have the same alignment too
+ *     - if even or odd columns have the same alignment as the result, then
+ *       // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
+ *       - perform half aligned and half unaligned loads (-> EvenAligned case)
+ *     - otherwise perform unaligned loads only (-> NoneAligned case)
+ *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
+ *     - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
+ *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
+ *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
+ *   - otherwise,
+ *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
+ *     // we currently fall back to the NoneAligned case
+ *
+ * The same reasoning apply for the transposed case.
+ * 
+ * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
+ * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
+ * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
+ * compared to unaligned loads on a 4 byte boundary.
+ *
 */
 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
@@ -52,11 +80,8 @@ EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index
-  #ifdef EIGEN_INTERNAL_DEBUGGING
-    resIncr
-  #endif
-  , RhsScalar alpha);
+        ResScalar* res, Index resIncr,
+  RhsScalar alpha);
 };

 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
@@ -64,12 +89,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index
-  #ifdef EIGEN_INTERNAL_DEBUGGING
-    resIncr
-  #endif
-  , RhsScalar alpha)
+        ResScalar* res, Index resIncr,
+  RhsScalar alpha)
 {
+  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
@@ -118,6 +141,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
    alignedSize = 0;
    alignedStart = 0;
  }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    // Currently, it seems to be better to perform unaligned loads anyway
+    alignmentPattern = NoneAligned;
+  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
@@ -265,7 +294,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
@@ -322,7 +351,7 @@ EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr,
+        ResScalar* res, Index resIncr,
  ResScalar alpha);
 };

@@ -336,6 +365,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
 {
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
+  
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
@@ -381,6 +411,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
    alignedSize = 0;
    alignedStart = 0;
  }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    alignmentPattern = NoneAligned;
+  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
@@ -418,7 +453,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
-    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+    EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
@@ -527,7 +562,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
  {
    for (Index i=start; i<end; ++i)
    {
-      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+      EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned result's coeffs
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -73,13 +73,13 @@ namespace internal {

 template<typename Index> struct GemmParallelInfo
 {
-  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
+  GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}

  int volatile sync;
  int volatile users;

-  Index rhs_start;
-  Index rhs_length;
+  Index lhs_start;
+  Index lhs_length;
 };

 template<bool Condition, typename Functor, typename Index>
@@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
  if((!Condition) || (omp_get_num_threads()>1))
    return func(0,rows, 0,cols);

-  Index size = transpose ? cols : rows;
+  Index size = transpose ? rows : cols;

  // 2- compute the maximal number of threads from the size of the product:
  // FIXME this has to be fine tuned
@@ -126,26 +126,25 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
    std::swap(rows,cols);

  Index blockCols = (cols / threads) & ~Index(0x3);
-  Index blockRows = (rows / threads) & ~Index(0x7);
+  Index blockRows = (rows / threads);
+  blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
  
  GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];

-  #pragma omp parallel for schedule(static,1) num_threads(threads)
-  for(Index i=0; i<threads; ++i)
+  #pragma omp parallel num_threads(threads)
  {
+    Index i = omp_get_thread_num();
    Index r0 = i*blockRows;
    Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;

    Index c0 = i*blockCols;
    Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;

-    info[i].rhs_start = c0;
-    info[i].rhs_length = actualBlockCols;
+    info[i].lhs_start = r0;
+    info[i].lhs_length = actualBlockRows;

-    if(transpose)
-      func(0, cols, r0, actualBlockRows, info);
-    else
-      func(r0, actualBlockRows, 0,cols, info);
+    if(transpose) func(c0, actualBlockCols, 0, rows, info);
+    else          func(0, rows, c0, actualBlockCols, info);
  }

  delete[] info;
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {

 // pack a selfadjoint block diagonal for use with the gebp_kernel
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
 struct symm_pack_lhs
 {
  template<int BlockRows> inline
@@ -45,25 +45,32 @@ struct symm_pack_lhs
  }
  void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
  {
+    enum { PacketSize = packet_traits<Scalar>::size };
    const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
    Index count = 0;
-    Index peeled_mc = (rows/Pack1)*Pack1;
-    for(Index i=0; i<peeled_mc; i+=Pack1)
-    {
-      pack<Pack1>(blockA, lhs, cols, i, count);
-    }
-
-    if(rows-peeled_mc>=Pack2)
-    {
-      pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
-      peeled_mc += Pack2;
-    }
+    //Index peeled_mc3 = (rows/Pack1)*Pack1;
+    
+    const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+    const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+    
+    if(Pack1>=3*PacketSize)
+      for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+        pack<3*PacketSize>(blockA, lhs, cols, i, count);
+    
+    if(Pack1>=2*PacketSize)
+      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+        pack<2*PacketSize>(blockA, lhs, cols, i, count);
+    
+    if(Pack1>=1*PacketSize)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+        pack<1*PacketSize>(blockA, lhs, cols, i, count);

    // do the same with mr==1
-    for(Index i=peeled_mc; i<rows; i++)
+    for(Index i=peeled_mc1; i<rows; i++)
    {
      for(Index k=0; k<i; k++)
-        blockA[count++] = lhs(i, k);              // normal
+        blockA[count++] = lhs(i, k);                   // normal

      blockA[count++] = numext::real(lhs(i, i));       // real (diagonal)

@@ -82,7 +89,8 @@ struct symm_pack_rhs
    Index end_k = k2 + rows;
    Index count = 0;
    const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
-    Index packet_cols = (cols/nr)*nr;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;

    // first part: normal case
    for(Index j2=0; j2<k2; j2+=nr)
@@ -91,79 +99,151 @@ struct symm_pack_rhs
      {
        blockB[count+0] = rhs(k,j2+0);
        blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
+        if (nr>=4)
        {
          blockB[count+2] = rhs(k,j2+2);
          blockB[count+3] = rhs(k,j2+3);
        }
+        if (nr>=8)
+        {
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+        }
        count += nr;
      }
    }

    // second part: diagonal block
-    for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
+    Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+    if(nr>=8)
    {
-      // again we can split vertically in three different parts (transpose, symmetric, normal)
-      // transpose
-      for(Index k=k2; k<j2; k++)
+      for(Index j2=k2; j2<end8; j2+=8)
      {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
          blockB[count+2] = numext::conj(rhs(j2+2,k));
          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
        }
-        count += nr;
-      }
-      // symmetric
-      Index h = 0;
-      for(Index k=j2; k<j2+nr; k++)
-      {
-        // normal
-        for (Index w=0 ; w<h; ++w)
-          blockB[count+w] = rhs(k,j2+w);
-
-        blockB[count+h] = numext::real(rhs(k,k));
-
-        // transpose
-        for (Index w=h+1 ; w<nr; ++w)
-          blockB[count+w] = numext::conj(rhs(j2+w,k));
-        count += nr;
-        ++h;
-      }
-      // normal
-      for(Index k=j2+nr; k<end_k; k++)
-      {
-        blockB[count+0] = rhs(k,j2+0);
-        blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+8; k++)
        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<8; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 8;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+8; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
          blockB[count+2] = rhs(k,j2+2);
          blockB[count+3] = rhs(k,j2+3);
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
+      {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+4; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<4; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 4;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+4; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          count += 4;
        }
-        count += nr;
      }
    }

    // third part: transposed
-    for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
+    if(nr>=8)
    {
-      for(Index k=k2; k<end_k; k++)
+      for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
      {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
+        for(Index k=k2; k<end_k; k++)
        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
          blockB[count+2] = numext::conj(rhs(j2+2,k));
          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
        }
-        count += nr;
      }
    }

    // copy the remaining columns one at a time (=> the same with nr==1)
-    for(Index j2=packet_cols; j2<cols; ++j2)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
    {
      // transpose
      Index half = (std::min)(end_k,j2);
@@ -261,11 +341,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
    // kc must smaller than mc
    kc = (std::min)(kc,mc);

-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
+    std::size_t sizeB = kc*cols;
    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
+    Scalar* blockB = allocatedBlockB;

    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
    symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -348,11 +427,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
    Index mc = rows;  // cache block size along the M direction
    Index nc = cols;  // cache block size along the N direction
    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
+    std::size_t sizeB = kc*cols;
    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
+    Scalar* blockB = allocatedBlockB;

    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -422,11 +500,11 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
      internal::traits<Dest>::Flags&RowMajorBit  ? RowMajor : ColMajor>
      ::run(
-        lhs.rows(), rhs.cols(),                 // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(),  // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(),  // rhs info
-        &dst.coeffRef(0,0), dst.outerStride(),  // result info
-        actualAlpha                             // alpha
+        lhs.rows(), rhs.cols(),                     // sizes
+        &lhs.coeffRef(0,0),    lhs.outerStride(),   // lhs info
+        &rhs.coeffRef(0,0),    rhs.outerStride(),   // rhs info
+        &dst.coeffRef(0,0), dst.outerStride(),      // result info
+        actualAlpha                                 // alpha
      );
  }
 };
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -79,8 +79,8 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
  for (Index j=FirstTriangular ? bound : 0;
       j<(FirstTriangular ? size : bound);j+=2)
  {
-    register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
-    register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+    const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;

    Scalar t0 = cjAlpha * rhs[j];
    Packet ptmp0 = pset1<Packet>(t0);
@@ -113,9 +113,9 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd

    for (size_t i=starti; i<alignedStart; ++i)
    {
-      res[i] += t0 * A0[i] + t1 * A1[i];
-      t2 += numext::conj(A0[i]) * rhs[i];
-      t3 += numext::conj(A1[i]) * rhs[i];
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
    }
    // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
    // gcc 4.2 does this optimization automatically.
@@ -147,7 +147,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
  }
  for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
  {
-    register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;

    Scalar t1 = cjAlpha * rhs[j];
    Scalar t2(0);
@@ -218,7 +218,7 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
    if(!EvalToDest)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
+      Index size = dest.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
      MappedDest(actualDestPtr, dest.size()) = dest;
@@ -227,7 +227,7 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
    if(!UseRhs)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = rhs.size();
+      Index size = rhs.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,

    std::size_t sizeA = kc*mc;
    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;

    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
    triangularBuffer.setZero();
@@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
          pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);

          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
-                      actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                      actualPanelWidth, actual_kc, 0, blockBOffset);

          // GEBP with remaining micro panel
          if (lengthTarget>0)
@@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);

            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
          }
        }
      }
@@ -211,7 +209,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
          gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
            (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);

-          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
        }
      }
    }
@@ -265,12 +263,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction

    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);

    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
    triangularBuffer.setZero();
@@ -304,6 +300,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
      Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;

      Scalar* geb = blockB+ts*ts;
+      geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));

      pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);

@@ -357,14 +354,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
                        actual_mc, panelLength, actualPanelWidth,
                        alpha,
                        actual_kc, actual_kc,  // strides
-                        blockOffset, blockOffset,// offsets
-                        blockW); // workspace
+                        blockOffset, blockOffset);// offsets
          }
        }
        gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
                    blockA, geb, actual_mc, actual_kc, rs,
                    alpha,
-                    -1, -1, 0, 0, blockW);
+                    -1, -1, 0, 0);
      }
    }
  }
--- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
@@ -109,7 +109,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
 /* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
   if (rows != depth) { \
 \
-     int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+     int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \
 \
     if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
     /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
@@ -223,7 +223,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
 /* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
   if (cols != depth) { \
 \
-     int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+     int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \
 \
     if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
     /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -322,7 +322,7 @@ template<> struct trmv_selector<RowMajor>
    if(!DirectlyUseRhs)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
--- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
@@ -129,7 +129,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
 /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
   if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
     x = x_tmp.data(); \
     if (size<rows) { \
@@ -214,7 +213,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
 /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
   if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
     x = x_tmp.data(); \
     if (size<rows) { \
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -66,11 +66,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju

    std::size_t sizeA = kc*mc;
    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;

    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

    conj_if<Conjugate> conj;
    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
@@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
            pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);

            gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
          }
        }
      }
@@ -174,7 +172,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
          {
            pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);

-            gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
+            gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
          }
        }
      }
@@ -215,11 +213,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj

    std::size_t sizeA = kc*mc;
    std::size_t sizeB = kc*size;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;

    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

    conj_if<Conjugate> conj;
    gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
@@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
                          actual_mc, panelLength, actualPanelWidth,
                          Scalar(-1),
                          actual_kc, actual_kc, // strides
-                          panelOffset, panelOffset, // offsets
-                          blockW);  // workspace
+                          panelOffset, panelOffset); // offsets
            }

            // unblocked triangular solve
@@ -317,7 +312,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
        if (rs>0)
          gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
                      actual_mc, actual_kc, rs, Scalar(-1),
-                      -1, -1, 0, 0, blockW);
+                      -1, -1, 0, 0);
      }
    }
  }
--- a/Show More
+++ b/Show More