Compare commits

921 Commits

Author SHA1 Message Date
Gael Guennebaud
b9827c495e bump to 3.2.7 2015-11-05 15:56:09 +01:00
Gael Guennebaud
6056f4404c fix unit test compilation 2015-11-05 15:36:48 +01:00
Gael Guennebaud
efd484546e bug #1063: nest AutoDiffScalar by value to avoid dead references
(grafted from 971cfbb480614229b5f48b040ef9d5dd18a4ab44)
2015-11-05 13:54:26 +01:00
Gael Guennebaud
a92681e0d2 Fix IterativeSolverBase for expressions as input 2015-11-05 12:05:31 +01:00
Gael Guennebaud
47592d31ea SPQR and UmfPack need to link to cholmod. 2015-11-05 12:05:02 +01:00
Gael Guennebaud
1a9dda6bfd Backport DartConfiguration.tcl tricks to make ctest -D Experimental work on recent cmake versions 2015-11-05 10:04:23 +01:00
Gael Guennebaud
4c1a2b5614 Add overloads for real times sparse<complex> operations.
This avoids real to complex conversions, and also fixes a compilation issue with MSVC.
2015-10-29 03:55:39 -07:00
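
For context, a minimal sketch of the kind of mixed-scalar expression these overloads enable (variable names are illustrative):

    #include <complex>
    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<std::complex<double> > m(3, 3);
      m.insert(0, 0) = std::complex<double>(1.0, 2.0);
      double s = 2.0;
      // The real scalar multiplies the complex sparse matrix directly,
      // without first promoting s to std::complex<double>.
      Eigen::SparseMatrix<std::complex<double> > r = s * m;
      return r.nonZeros() == 1 ? 0 : 1;
    }
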
Gael Guennebaud
c308cb1b24 Backport DenseStorage::operator= implementations (fix regression with MSVC) 2015-11-04 18:41:44 +01:00
Gael Guennebaud
85e9e6e780 Fix compilation issue 2015-11-04 18:40:35 +01:00
Gael Guennebaud
c030925a66 Add support for dense.cwiseProduct(sparse)
This also fixes a regression regarding (dense*sparse).diagonal()
2015-11-04 17:42:07 +01:00
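
A short usage sketch of the mixed dense/sparse coefficient-wise product (names are illustrative); the result is sparse, since it vanishes wherever the sparse operand does:

    #include <Eigen/Dense>
    #include <Eigen/SparseCore>

    int main() {
      Eigen::MatrixXd d = Eigen::MatrixXd::Constant(3, 3, 2.0);
      Eigen::SparseMatrix<double> s(3, 3);
      s.insert(1, 1) = 5.0;
      // Coefficient-wise product of a dense and a sparse operand.
      Eigen::SparseMatrix<double> p = d.cwiseProduct(s);
      return p.coeff(1, 1) == 10.0 ? 0 : 1;
    }
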
Gael Guennebaud
fd074be1a0 bug #1101: typo
(grafted from ddaaa2d381)
2015-10-30 12:02:52 +01:00
Gael Guennebaud
e685bd7f46 Bug #1100: remove explicit CMAKE_INSTALL_PREFIX prefix to please cmake install's DESTINATION argument
(grafted from c8c8821038)
2015-10-30 12:00:34 +01:00
Gael Guennebaud
e82f507747 Fix several shortcomings in cost computation (the Dynamic case was ignored) 2015-10-28 11:52:28 +01:00
Gael Guennebaud
1eea595550 Fix computation of CwiseUnaryOp::CoeffReadCost when the cost of the nested expression is Dynamic 2015-10-27 22:22:02 +01:00
Gael Guennebaud
d0980c7706 bug #1092: fix iterative solver ctors for expressions as input 2015-10-26 16:16:24 +01:00
Abhijit Kundu
9055400f3d Added ArpackSupport to cmake install target
(grafted from 1127ca8586)
2015-10-16 16:41:33 -07:00
Gael Guennebaud
acb3c60295 Make the IterativeLinearSolvers module compatible with MPL2-only mode
by defaulting to COLAMDOrdering and NaturalOrdering for ILUT and ILLT respectively.
2015-10-26 15:17:52 +01:00
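
MPL2-only mode is requested by defining EIGEN_MPL2_ONLY before including any Eigen header; with the default orderings above, a sketch like the following should build in that mode (the solver and preconditioner choice is illustrative):

    #define EIGEN_MPL2_ONLY
    #include <Eigen/Dense>
    #include <Eigen/IterativeLinearSolvers>
    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<double> A(10, 10);
      A.setIdentity();
      Eigen::VectorXd b = Eigen::VectorXd::Ones(10);
      // ILUT-preconditioned BiCGSTAB without LGPL-licensed code paths.
      Eigen::BiCGSTAB<Eigen::SparseMatrix<double>,
                      Eigen::IncompleteLUT<double> > solver(A);
      Eigen::VectorXd x = solver.solve(b);
      return x.size() == 10 ? 0 : 1;
    }
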
Gael Guennebaud
f8b88d21a6 bug #1088: fix setIdentity for non-compressed sparse matrix 2015-10-25 22:01:58 +01:00
Sergiu Dotenco
89a222ce50 use explicit Scalar types for AngleAxis initialization 2015-08-28 22:20:15 +02:00
Hauke Heibel
960ec7aef2 Switched to MPL2 license.
(grafted from 6f5f488a80)
2013-08-12 07:39:24 +02:00
Gael Guennebaud
e8bd2d49b3 bug #1090: fix a shortcoming in redux logic for which slice-vectorization plus unrolling might happen.
(grafted from e78bc111f1)
2015-10-21 20:58:33 +02:00
Gael Guennebaud
f444996a7a bug #266: backport changeset 7c99b38b7c
about support for C++11 move semantics
2015-10-21 09:21:07 +02:00
Gael Guennebaud
a7c2e62a52 Backport change of operator/=(Scalar) to perform a true division 2015-10-18 22:27:42 +02:00
Gael Guennebaud
9ff967199a Improve numerical accuracy in LLT and triangular solve by using true scalar divisions (instead of x * (1/y))
(grafted from fe630c9873)
2015-10-18 22:15:01 +02:00
Gael Guennebaud
dc0ef2cbed Fix misuse of hg resolve when backporting the previous changeset 2015-10-12 16:24:19 +02:00
Gael Guennebaud
7aa90a3b0f bug #1086: replace deprecated UF_long by SuiteSparse_long 2015-10-12 16:20:12 +02:00
Gael Guennebaud
56488ddc0f bug #1080: fix some warnings (already fixed in devel branch) 2015-10-12 10:23:53 +02:00
Gael Guennebaud
165b69ca74 Added tag 3.2.6 for changeset 7abf6d02db 2015-10-01 09:06:37 +02:00
Gael Guennebaud
7abf6d02db bump to 3.2.6 2015-10-01 09:06:10 +02:00
Gael Guennebaud
73cb54835c bug #1075: fix AlignedBox::sample for runtime dimension
(grafted from 75a60d3ac0)
2015-09-30 11:44:02 +02:00
Gael Guennebaud
cfe315476f Add PlainObjectBase copy ctor from PlainObjectBase and DenseBase objects. (manual backport from default branch; fixes a segfault when creating a PlainObjectBase object, though such a usage is not recommended at all) 2015-09-28 15:51:00 +02:00
Gael Guennebaud
f1583e86f6 bug #1073: backport common pitfalls page 2015-09-28 14:59:54 +02:00
Gael Guennebaud
4bd69750ed Add missing unit tests for vector-wise all/any 2015-09-19 21:45:48 +02:00
Gael Guennebaud
d40e32c94e Fix bug #1067: naming conflict 2015-09-19 21:45:11 +02:00
Christoph Hertzberg
a0bf1b4242 Removed documentation of removed method (as in fab96f2ff3)
2015-09-13 16:39:48 +02:00
Gael Guennebaud
cf645db95b MKL is now free of charge for open source
(grafted from 5bf971e5b8)
2015-09-07 11:23:55 +02:00
Gael Guennebaud
13135a82bd bug #1062: backport fix of SelfAdjointEigenSolver for RowMajor matrices from default branch 2015-09-04 18:26:26 +02:00
Gael Guennebaud
769cb99845 Fix sparselu unit test. 2015-09-03 13:56:02 +02:00
Thomas Capricelli
ba9add3c59 fix a conflict committed by mistake 2015-09-03 13:51:17 +02:00
Gael Guennebaud
ddfb72a92f bug #1053: fix SuperLU::solve with EIGEN_DEFAULT_TO_ROW_MAJOR
(grafted from 5a1cc5d24c)
2015-09-03 11:25:36 +02:00
Gael Guennebaud
8c7e281c9e Fix AMD ordering when a column has only one off-diagonal non-zero (also fix bug #1045) 2015-09-03 11:04:06 +02:00
Gael Guennebaud
66c092e44e bug #1057: fix a declaration mismatch with MSVC
(grafted from a75616887e)
2015-09-02 09:31:32 +02:00
Gael Guennebaud
3ec6d38f35 bug #1059: fix predux_max<Packet4i> for NEON (this was already fixed in the default branch) 2015-09-01 16:30:18 +02:00
Gael Guennebaud
96f64441f7 bug #1055: Fix incomplete backport in changeset 0ebce69424 2015-09-01 16:11:43 +02:00
Sergiu Dotenco
5af4d77511 fixed Quaternion identity initialization for non-implicitly convertible types 2015-08-20 20:55:37 +02:00
Christoph Hertzberg
88ac8ffad5 bug #1054: Use set(EIGEN_CXX_FLAG_VERSION "/version") only for Intel compilers on Windows.
Also removed code calling `head -n1`, always using the integrated REGEX functionality instead.
2015-08-14 15:32:15 +02:00
Christoph Hertzberg
edb0183e0c bug #1053: SparseLU failed with EIGEN_DEFAULT_TO_ROW_MAJOR 2015-08-07 23:07:29 +02:00
Gael Guennebaud
befa141699 Fix Jacobi preconditioner with zero diagonal entries
(grafted from c06ec0f464)
2014-06-17 23:47:30 +02:00
Gael Guennebaud
5c70b43abd bug #1048: fix unused variable warning
(grafted from 41e1f3498c)
2015-07-28 22:59:50 +02:00
Christoph Hertzberg
6a3797f46f bug #792: SparseLU::factorize failed for structurally rank deficient matrices 2015-07-26 20:39:32 +02:00
Christoph Hertzberg
c4432aad15 bug #1033: Add explicit type conversion from 0 to RealScalar 2015-07-17 13:19:55 +02:00
Christoph Hertzberg
ea0168c5a5 fix for MKL_BLAS not defined in MKL 11.2
(grafted from 4b678b96eb)
2014-09-08 17:37:58 +08:00
Christoph Hertzberg
05fad4959a bug #1039: Redefining EIGEN_DEFAULT_DENSE_INDEX_TYPE may lead to errors 2015-07-13 16:08:02 +02:00
Gael Guennebaud
98eedb0c9a bug #1000: MSVC 2013 does need the operator= workaround 2015-06-26 14:04:24 +02:00
Gael Guennebaud
71424c4bf8 bug #1026: fix infinite loop for an empty input
(grafted from e102ddbf1f)
2015-06-26 14:02:52 +02:00
Gael Guennebaud
e59b246b08 Backport changes in Ref/MapBase to fix MSVC 2013 confusion. 2015-06-23 16:22:46 +02:00
Gael Guennebaud
4aa7038074 Added tag 3.2.5 for changeset d9c80169e0 2015-06-16 11:53:12 +02:00
Gael Guennebaud
d9c80169e0 bump to 3.2.5 2015-06-16 11:53:07 +02:00
Gael Guennebaud
b514c943c7 Fix installation of some unsupported modules 2015-06-16 11:51:58 +02:00
Christoph Hertzberg
8ba643a903 bug #1014: More stable direct computation of eigenvalues and -vectors for 3x3 matrices 2015-05-17 21:54:32 +02:00
Gael Guennebaud
595c00157c Applied patch from Richard JW Roberts, resolving bug #704
(grafted from devel branch)
2015-06-15 22:02:57 +02:00
Gael Guennebaud
1c6b224fb3 Remove aligned-on-scalar assert and fall back to the non-vectorized path at runtime (first_aligned already had this runtime guard) 2015-06-14 15:04:07 +02:00
Gael Guennebaud
2361ec9c0e Fix a regression introduced in changeset 2461531e5a 2015-06-13 22:32:10 +02:00
Gael Guennebaud
fcd213a297 Fix use of uninitialized buffers.
(grafted from 2f2a441a4d)
2015-06-13 22:19:40 +02:00
Gael Guennebaud
37ed0d991a aligned-on-scalar assertion was still too aggressive: it now takes into account the sizes at runtime 2015-06-13 21:49:11 +02:00
Gael Guennebaud
62b08cf9f9 Limit aligned-on-scalar assert on Map 2015-06-12 08:59:26 +02:00
Gael Guennebaud
46f011466b Relax aligned-on-scalar assert for lvalue only 2015-06-12 08:50:15 +02:00
Gael Guennebaud
f600bdd76b Map: assert on unaligned on scalar only if the object might be vectorized 2015-06-11 22:17:56 +02:00
Gael Guennebaud
421aa4f358 typo 2015-06-09 18:34:13 +02:00
Gael Guennebaud
554356b034 bug #650: fix dense += sparse_row_major * dense 2015-06-09 18:03:38 +02:00
Gael Guennebaud
97119f854f bug #1003: assert in MapBase if the provided pointer is not aligned on scalar while it is expected to be. Also add an EIGEN_ALIGN8 macro. 2015-06-09 17:42:09 +02:00
Gael Guennebaud
51ab034f63 bug #872: remove usage of deprecated bind1st/bind2nd functions (manually backported from devel branch) 2015-06-09 11:06:39 +02:00
Gael Guennebaud
0ebce69424 Update approx. minimum ordering method to push and keep structural empty diagonal elements to the bottom-right part of the matrix 2015-03-20 16:33:48 +01:00
Gael Guennebaud
a748673bbb bug #1016: fix scalar conversion 2015-06-05 16:04:51 +02:00
Gael Guennebaud
8597ee502b bug #705: fix handling of Lapack potrf return code
(grafted from 0a9b5d1396)
2015-06-05 15:59:13 +02:00
Gael Guennebaud
ac66f1c73d Fix usage of EIGEN_NO_AUTOMATIC_RESIZING: resizing still has to be performed for a non-initialized object (was already fixed in devel branch) 2015-05-26 10:44:37 +02:00
Christoph Hertzberg
b392e6b21c Merged in mvdyck/eigen-3/3.2 (pull request PR-115)
[[DOC]] Topic Multithreading dox compile error in example code resolved as in default branch
2015-05-09 01:40:43 +02:00
Michiel Van Dyck
e88aaae5f4 Merged in mvdyck/doc-topicmultithreadingdox-resolved-comp-1431118452618 (pull request PR-1)
[[DOC]] TopicMultithreading.dox compile error in example code resolved as in default branch
2015-05-08 22:56:14 +02:00
Michiel Van Dyck
2d217a60a7 Close branch mvdyck/doc-topicmultithreadingdox-resolved-comp-1431118452618 2015-05-08 22:56:14 +02:00
Michiel Van Dyck
ef1439252c [[DOC]] TopicMultithreading.dox compile error in example code resolved as in default branch 2015-05-08 20:55:34 +00:00
Gael Guennebaud
847bb317cd bug #1013: fix 2x2 direct eigensolver for identical eigenvalues 2015-05-07 15:55:12 +02:00
Gael Guennebaud
62d334c7d3 Fix bug #1010: m_isInitialized was improperly updated
(grafted from ebf8ca4fa8)
2015-05-07 14:20:42 +02:00
Christoph Hertzberg
7713b29084 bug #1012: Enable alloca on Mac OS or if alloca is defined as macro 2015-05-06 13:24:48 +02:00
Christoph Hertzberg
a08df3ff34 Fix regression introduced by last merge 2015-05-06 11:03:00 +02:00
Christoph Hertzberg
5bb9459124 bug #999: clarify that behavior of empty AlignedBoxes is undefined, and further improvements in documentation 2015-04-30 19:29:47 +02:00
Christoph Hertzberg
80fd8fab87 Regression test for bug #302 2015-04-26 20:58:13 +02:00
Christoph Hertzberg
84eeabd223 Fix bug #1000: Manually inherit assignment operators for MSVC 2013 and later (as required by the standard). 2015-04-23 13:39:31 +02:00
Gael Guennebaud
058fa781d7 Fix bug #996: fix comparisons to 0 instead of Scalar(0)
(grafted from e0cff9ae0d)
2015-04-15 14:48:53 +02:00
Christoph Hertzberg
b03209a7a6 Make conversion from 0 to Scalar explicit (issue reported by Brad Bell) 2015-04-13 17:10:52 +02:00
Christoph Hertzberg
71590d0ac7 bug #993: Passing matrix.inverse() as MatrixBase lead to infinite recursion. 2015-04-09 20:29:41 +02:00
Christoph Hertzberg
1e1b4b6678 Cygwin compatibility issues (manually backported from main branch) 2015-04-09 20:26:47 +02:00
Gael Guennebaud
2e3353634f bug #986: add support for coefficient-based product with 0 depth. 2015-04-01 13:21:47 +02:00
Gael Guennebaud
2461531e5a Fix bug #987: wrong alignment guess in diagonal product. 2015-03-31 23:36:54 +02:00
Christoph Hertzberg
a68917594b Change CMake warning to simple message for old Metis versions
(transplanted from 7bd578d11d)
2015-03-31 00:50:04 +02:00
Christoph Hertzberg
3b93b1afb3 Addendum to last patch: k is Index and not int
(transplanted from 3238ca6abc)
2015-03-31 00:42:14 +02:00
Christoph Hertzberg
0fb74c1f8b bug #985: RealQZ failed when either matrix had zero rows or columns (report and patch by Ben Goodrich)
Also added a regression test
(transplanted from 1efae98fee)
2015-03-30 23:56:20 +02:00
Christoph Hertzberg
bf650a3686 bug #983: Pass Vector3 by const reference and not by value
(transplanted from 09a5361d1b)
2015-03-28 12:36:24 +01:00
Christoph Hertzberg
8fa951e31d Optionally build the documentation when building unit tests. 2015-03-27 16:41:28 +01:00
Deanna Hood
1b64edbfd4 Make html directory before generating output image there
(transplanted from 2ab4922431)
2015-03-18 07:24:13 +10:00
Gael Guennebaud
c74284ed81 bug #949: add static assertion for incompatible scalar types in dense end-user decompositions. 2015-03-13 21:06:20 +01:00
Gael Guennebaud
b09316fbea bug #980: fix taking a row (resp. column) of a column-major (resp. row-major) sparse matrix and add missing coeff/coeffRef members. 2015-03-13 15:13:58 +01:00
Gael Guennebaud
c5fc8e6bdc bug #969: workaround ambiguous calls to Ref using enable_if. 2015-03-06 17:51:31 +01:00
Gael Guennebaud
88c844ae2f bug #824: improve accuracy of Quaternion::angularDistance using atan2 instead of acos.
(grafted from 2dc968e453)
2015-03-04 17:03:13 +01:00
Gael Guennebaud
500c36de61 Merged in blechta/eigen/fix-cg-zero-guess (pull request PR-100)
Really use zero guess in ConjugateGradient::solve as documented
2015-03-04 11:42:25 +01:00
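
In API terms, the fix makes the two entry points behave as documented; a sketch (trivial identity system, names illustrative):

    #include <Eigen/Dense>
    #include <Eigen/IterativeLinearSolvers>
    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<double> A(5, 5);
      A.setIdentity();
      Eigen::VectorXd b = Eigen::VectorXd::Ones(5);
      Eigen::ConjugateGradient<Eigen::SparseMatrix<double> > cg(A);
      Eigen::VectorXd x1 = cg.solve(b);  // now really starts from a zero guess
      Eigen::VectorXd x0 = Eigen::VectorXd::Zero(5);
      Eigen::VectorXd x2 = cg.solveWithGuess(b, x0);  // explicit initial guess
      return (x1 - x2).norm() < 1e-12 ? 0 : 1;
    }
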
Gael Guennebaud
26234720bd Fix bug #972: allow coeff-based products of depth 0 and remove a useless statement in coeff-based product. 2015-02-28 15:25:39 +01:00
Gael Guennebaud
0e38796e1c Fix bug #961: eigen-doc.tgz included part of itself.
(grafted from fc5c3e85e2)
2015-02-18 15:47:01 +01:00
Gael Guennebaud
a2d9a4806a Fix bug #714: the actual number of threads might be lower than the number of requested ones. 2015-02-18 15:24:05 +01:00
Jan Blechta
a72bf09e6d Really use zero guess in ConjugateGradient::solve as documented
and expected for consistency with other methods.
2015-02-18 14:26:10 +01:00
Gael Guennebaud
bb3e5c29cc Bug #957: workaround MSVC/ICC compilation issue 2015-02-18 11:24:32 +01:00
Gael Guennebaud
81b3d29b26 Fix SparseLU::signDeterminant() method, and add a SparseLU::determinant() method. 2015-02-16 19:16:21 +01:00
Gael Guennebaud
f0b1b1df9b Fix SparseLU::signDeterminant() method, and add a SparseLU::determinant() method. 2015-02-16 19:09:22 +01:00
Gael Guennebaud
e061b7a538 Add PermutationMatrix::determinant method.
(grafted from 8768ff3c31)
2015-02-16 19:08:25 +01:00
Gael Guennebaud
8768ff3c31 Add PermutationMatrix::determinant method. 2015-02-16 19:08:25 +01:00
Gael Guennebaud
77af14fb62 bug #914: fix compiler detection on windows 2015-02-16 16:26:47 +01:00
Martin Drozdik
64b29e06b9 bug #956: Fixed bug in move constructors of DenseStorage which caused "moved-from" objects to be in an invalid state. 2015-02-16 18:18:46 +09:00
Gael Guennebaud
1c0e8bcf09 Fix unused variable warning. 2015-02-16 17:21:30 +01:00
Gael Guennebaud
69fa405096 Update circulant custom expression example 2015-02-16 17:21:16 +01:00
Gael Guennebaud
0f464d9d87 bug #897: fix regression in BiCGSTAB(mat) ctor (and all other iterative solvers).
Add respective regression unit test.
2015-02-16 17:05:10 +01:00
Gael Guennebaud
470d26d580 Remove some useless typedefs 2015-02-16 16:48:21 +01:00
Gael Guennebaud
4dded73227 bug #914: fix compiler detection on windows
(grafted from 77af14fb62)
2015-02-16 16:26:47 +01:00
Gael Guennebaud
953d5ccfd5 Doc: explain how to free allocated memory in SparseMatrix 2015-02-16 15:56:11 +01:00
Gael Guennebaud
98604576d1 Merged in chtz/eigen-indexconversion (pull request PR-92)
bug #877, bug #572: Get rid of Index conversion warnings, summary of changes:
- Introduce a global typedef Eigen::Index making Eigen::DenseIndex and AnyExpr<>::Index deprecated (default is std::ptrdiff_t).
- Eigen::Index is used throughout the API to represent indices, offsets, and sizes.
- Classes storing an array of indices use the type StorageIndex to store them. This is a template parameter of the class. Default is int.
- Methods that *explicitly* set or return an element of such an array take or return a StorageIndex type. In all other cases, the Index type is used.
2015-02-16 15:29:00 +01:00
Gael Guennebaud
45cbb0bbb1 The usage of DenseIndex is deprecated, so let's replace DenseIndex by Index 2015-02-16 15:05:41 +01:00
Gael Guennebaud
cc641aabb7 Remove deprecated usage of expr::Index. 2015-02-16 14:46:51 +01:00
Gael Guennebaud
aa6c516ec1 Fix many long to int conversion warnings:
- fix usage of Index (API) versus StorageIndex (when multiple indexes are stored)
- use StorageIndex(val) when the input has already been checked
- use internal::convert_index<StorageIndex>(val) when val is potentially unsafe (directly comes from user input)
2015-02-16 13:19:05 +01:00
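
A sketch of the convention the two entries above describe, using the post-refactoring names (Eigen::Index for API-level sizes and offsets, StorageIndex for stored index arrays):

    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<double> m(100, 100);  // StorageIndex defaults to int
      Eigen::Index rows = m.rows();             // API type, std::ptrdiff_t by default
      typedef Eigen::SparseMatrix<double>::StorageIndex StorageIndex;
      // Narrow from Index to StorageIndex explicitly once the value is known
      // to fit (cf. internal::convert_index for potentially unsafe input).
      StorageIndex si = static_cast<StorageIndex>(rows);
      return si == 100 ? 0 : 1;
    }
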
Christoph Hertzberg
b39413794e bug #952: Missing \endcode made doxygen fail to build ColPivHouseholderQR
(transplanted from bd511dde9d)
2015-02-15 06:08:25 +01:00
Christoph Hertzberg
bd511dde9d bug #952: Missing \endcode made doxygen fail to build ColPivHouseholderQR 2015-02-15 06:08:25 +01:00
Benoit Steiner
e2cfddf75f Pulled latest updates from trunk 2015-02-13 16:21:59 -08:00
Benoit Steiner
0927801a84 Optimized versions of the sin(), exp(), log() and sqrt() functions for AVX 2015-02-13 16:07:08 -08:00
Benoit Jacob
e972b55ec4 bug #953 - Fix prefetches in 3px4 product kernel
This gives a 10% speedup on Nexus 4 and on Nexus 5.
2015-02-13 14:52:36 -05:00
Gael Guennebaud
fc202bab39 Index refactoring: StorageIndex must be used for storage only (and locally when it makes sense). In all other cases use the global Index type. 2015-02-13 18:57:41 +01:00
Gael Guennebaud
fe51319980 Merge Index-refactoring branch with default, fix PastixSupport, remove some useless typedefs 2015-02-13 10:03:53 +01:00
Gael Guennebaud
0918c51e60 merge Tensor module within Eigen/unsupported and update gemv BLAS wrapper 2015-02-12 21:48:41 +01:00
Gael Guennebaud
409547a0c8 update EIGEN_FAST_MATH documentation 2015-02-12 21:04:31 +01:00
Benoit Steiner
4470c99975 Added a test to validate tensor casting on cuda devices 2015-02-10 14:40:18 -08:00
Benoit Steiner
6620aaa4b3 Silenced a few compilation warnings generated by nvcc 2015-02-10 14:34:42 -08:00
Benoit Steiner
f669f5656a Marked a few functions as EIGEN_DEVICE_FUNC to enable the use of tensors in cuda kernels. 2015-02-10 14:29:47 -08:00
Gael Guennebaud
029d236ceb merge 2015-02-10 23:12:47 +01:00
Gael Guennebaud
fe25f3b8e3 FMA has been wrongly disabled 2015-02-10 23:11:35 +01:00
Benoit Steiner
ceb4c9c10b Pulled latest changes from trunk 2015-02-10 14:03:17 -08:00
Benoit Steiner
cc5d7ff523 Added vectorized implementation of the exponential function for ARM/NEON 2015-02-10 14:02:38 -08:00
Gael Guennebaud
d771295554 remove useless include 2015-02-10 22:59:27 +01:00
Benoit Steiner
fefec723aa Fixed compilation error triggered when trying to vectorize a non-vectorizable CUDA kernel. 2015-02-10 13:16:22 -08:00
Benoit Steiner
780b2422e2 Silenced the last batch of compilation warnings triggered by gcc 4.8 2015-02-10 12:43:55 -08:00
Benoit Steiner
c21e45fbc5 Fixed a few more compilation warnings 2015-02-10 12:36:26 -08:00
Benoit Steiner
057cfd2f02 Silenced more compilation warnings 2015-02-10 12:25:02 -08:00
Benoit Steiner
114e863f08 Silenced a few compilation warnings 2015-02-10 12:20:24 -08:00
Benoit Steiner
410895a7e4 Silenced several compilation warnings 2015-02-10 12:13:19 -08:00
Benoit Steiner
4716c2c666 Fixed compilation error 2015-02-10 12:06:19 -08:00
Benoit Steiner
91fe3a3004 Removed a debug printf statement. 2015-02-10 10:29:28 -08:00
Jan Blechta
84bba80916 Fix bug #733: step by step solving is not a good example for solveWithGuess 2015-02-10 14:24:39 +01:00
Gael Guennebaud
91953d2d37 Backport MINRES fixes to 3.2 2015-02-10 19:21:41 +01:00
Gael Guennebaud
7b35b4cacc Allows Lower|Upper as a template argument of CG and MINRES: in this case the full matrix will be considered. 2015-02-10 18:57:41 +01:00
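
Usage-wise (a sketch): passing Lower|Upper as the UpLo template argument makes the solver read the full stored matrix rather than one triangular half:

    #include <Eigen/Dense>
    #include <Eigen/IterativeLinearSolvers>
    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<double> A(5, 5);
      A.setIdentity();
      Eigen::VectorXd b = Eigen::VectorXd::Ones(5);
      // The default UpLo is Lower; Lower|Upper means "use the whole matrix".
      Eigen::ConjugateGradient<Eigen::SparseMatrix<double>,
                               Eigen::Lower | Eigen::Upper> cg(A);
      Eigen::VectorXd x = cg.solve(b);
      return x.size() == 5 ? 0 : 1;
    }
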
Jan Blechta
c3f3580b8f Fix bug #733: step by step solving is not a good example for solveWithGuess 2015-02-10 14:24:39 +01:00
Gael Guennebaud
deecff97ed typo 2015-02-10 19:22:05 +01:00
Gael Guennebaud
c6e8caf090 Allows Lower|Upper as a template argument of CG and MINRES: in this case the full matrix will be considered. 2015-02-10 18:57:41 +01:00
Gael Guennebaud
d10d6a40dd bug #897: Update unsupported iterative solvers based on IterativeSolverBased. 2015-02-10 13:02:59 +01:00
Gael Guennebaud
87629cd639 bug #897: makes iterative sparse solvers use a Ref<SparseMatrix> instead of a SparseMatrix pointer. This fixes usage of iterative solvers with a Map<SparseMatrix>. 2015-02-09 11:41:25 +01:00
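
The usage this unlocks, roughly (a sketch assuming the Map<SparseMatrix> specialization added below; the buffer pointers come from an owning SparseMatrix purely for illustration):

    #include <Eigen/Dense>
    #include <Eigen/IterativeLinearSolvers>
    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<double> A(5, 5);
      A.setIdentity();
      A.makeCompressed();
      // A non-owning view over external CSC buffers...
      Eigen::Map<Eigen::SparseMatrix<double> > mapA(
          A.rows(), A.cols(), A.nonZeros(),
          A.outerIndexPtr(), A.innerIndexPtr(), A.valuePtr());
      // ...which the solver now accepts through Ref<SparseMatrix>.
      Eigen::BiCGSTAB<Eigen::SparseMatrix<double> > solver(mapA);
      Eigen::VectorXd b = Eigen::VectorXd::Ones(5);
      Eigen::VectorXd x = solver.solve(b);
      return x.size() == 5 ? 0 : 1;
    }
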
Gael Guennebaud
bde98df03f merge 2015-02-09 11:15:37 +01:00
Gael Guennebaud
d4ec48575e Make Block<SparseMatrix> inherit SparseCompressedBase in the case of inner panels, and fix valuePtr()/innerIndexPtr() 2015-02-09 11:14:36 +01:00
Gael Guennebaud
554aa9b31d Add failtests for Ref<SparseMatrix> 2015-02-09 10:24:07 +01:00
Gael Guennebaud
3af29caae8 Cleaning and add more unit tests for Ref<SparseMatrix> and Map<SparseMatrix> 2015-02-09 10:23:45 +01:00
Gael Guennebaud
f2ff8c091e Add a Ref<SparseMatrix> specialization. 2015-02-07 22:04:18 +01:00
Gael Guennebaud
f3be317614 Add a Map<SparseMatrix> specialization. 2015-02-07 22:03:25 +01:00
Gael Guennebaud
08081f8293 Make SparseTranspose inherit SparseCompressedBase when possible 2015-02-07 22:02:14 +01:00
Gael Guennebaud
7838fda82c Add a SparseCompressedBase class providing (un)compressed accessors (like data()/*Stride() for dense matrices),
and a CompressedAccessBit flag (similar to DirectAccessBit for dense matrices).
2015-02-07 22:00:46 +01:00
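
Concretely, the accessors in question look like this on a SparseMatrix (a sketch):

    #include <Eigen/SparseCore>
    #include <iostream>

    int main() {
      Eigen::SparseMatrix<double> m(4, 4);
      m.insert(0, 0) = 1.0;
      m.insert(2, 1) = 3.0;
      m.makeCompressed();  // switch from uncompressed to compressed (CSC) mode
      std::cout << m.isCompressed() << "\n";     // 1 after makeCompressed()
      std::cout << m.valuePtr()[0] << "\n";      // raw coefficient array
      std::cout << m.innerIndexPtr()[0] << "\n"; // row indices (column-major)
      std::cout << m.outerIndexPtr()[1] << "\n"; // start of column 1
      return 0;
    }
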
Benoit Steiner
3ba6647398 Fixed the cxx11_meta test 2015-02-06 06:00:59 -08:00
Benoit Steiner
01f7918788 Pulled latest fixes 2015-02-06 05:30:20 -08:00
Gael Guennebaud
b50ffaddf2 merge 2015-02-06 14:27:12 +01:00
Gael Guennebaud
74e460b995 Fix symmetric product 2015-02-06 14:26:24 +01:00
Gael Guennebaud
c03c73c9b7 Fix clang compilation 2015-02-06 14:26:12 +01:00
Gael Guennebaud
668518aed6 Fix uninitialized entries and comparison of very small numbers 2015-02-06 14:25:41 +01:00
Benoit Steiner
c739102ef9 Pulled the latest changes from the trunk 2015-02-06 05:25:03 -08:00
Benoit Steiner
2559fa9b0f Fixed compilation error in the tensor broadcasting test 2015-02-06 02:55:18 -08:00
Benoit Steiner
dcb2a8b184 Added the EIGEN_HAS_CONSTEXPR define
Gate the tensor index list code based on the value of EIGEN_HAS_CONSTEXPR
2015-02-06 02:51:59 -08:00
Filippo Basso
a8f2c6eec7 Using numext::pow instead of std::pow in poly_eval function. 2015-02-04 18:37:51 +00:00
Gael Guennebaud
b1eca55328 Use Ref<> to ensure that both x and b in Ax=b are compatible with Umfpack/SuperLU expectations 2015-02-03 23:46:05 +01:00
Gael Guennebaud
f9931a0392 SPQR: fix default threshold value 2015-02-03 22:32:34 +01:00
Gael Guennebaud
ebdf6a2dbb SPQR: fix default threshold value 2015-02-03 22:32:34 +01:00
Benoit Steiner
f64045a060 Silenced a few more compilation warnings 2015-01-30 19:52:01 -08:00
Benoit Steiner
590f4b0aa3 Silenced some compilation warnings 2015-01-30 19:46:30 -08:00
Benoit Jacob
5ef95fabee bug #936, patch 3/3: Properly detect FMA support on ARM (requires VFPv4)
and use it instead of MLA when available, because it's both more accurate
and faster.
2015-01-30 17:45:03 -05:00
Benoit Jacob
0f21613698 bug #936, patch 2/3: Remove EIGEN_VECTORIZE_FMA, was redundant with EIGEN_HAS_SINGLE_INSTRUCTION_MADD 2015-01-30 17:44:26 -05:00
Benoit Jacob
340b8afb14 bug #936, patch 1.5/3: rename _FUSED_ macros to _SINGLE_INSTRUCTION_,
because this is what they are about. "Fused" means "no intermediate rounding
between the mul and the add, only one rounding at the end". Instead,
what we are concerned about here is whether a temporary register is needed,
i.e. whether the MUL and ADD are separate instructions.
Concretely, on ARM NEON, a single-instruction mul-add is always available: VMLA.
But a true fused mul-add is only available on VFPv4: VFMA.
2015-01-31 14:15:57 -05:00
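
The rounding distinction is easy to observe in scalar code (a sketch, with values chosen so the two roundings differ):

    #include <cmath>
    #include <cstdio>

    int main() {
      double a = 1.0 + 1e-8, b = 1.0 - 1e-8, c = -1.0;
      double separate = a * b + c;       // MUL rounds, then ADD rounds again
      double fused = std::fma(a, b, c);  // a single rounding at the very end
      // The two results typically differ in the last bits.
      std::printf("%.17g vs %.17g\n", separate, fused);
      return 0;
    }
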
Benoit Jacob
9f99f61e69 bug #936, patch 1/3: some cleanup and renaming for consistency. 2015-01-30 17:43:56 -05:00
Benoit Jacob
759bd92a85 bug #935: Add asm comments in GEBP kernels to work around a bug
in both GCC and Clang on ARM/NEON, whereby they spill registers,
severely harming performance. The reason why the asm comments
make a difference is that they prevent the compiler from
reordering code across these boundaries, which has the effect
of extending the lifetime of local variables and increasing
register pressure on this register-tight code.
2015-01-30 17:27:56 -05:00
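
The gist of the workaround (a minimal sketch, not Eigen's exact code): an inline-asm statement that emits only a comment, yet acts as a boundary the compiler will not reorder across:

    static inline void kernel_boundary() {
      // volatile + "memory" clobber: GCC/Clang may not move memory accesses
      // across this point, which bounds live ranges and register pressure.
      __asm__ volatile("# gebp kernel boundary" ::: "memory");
    }

    int main() {
      kernel_boundary();
      return 0;
    }
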
Gael Guennebaud
f89ba2a58b bug #941: fix accuracy issue in ColPivHouseholderQR, do not stop decomposition on a small pivot
(grafted from f1092d2f73)
2015-01-30 19:04:04 +01:00
Gael Guennebaud
f1092d2f73 bug #941: fix accuracy issue in ColPivHouseholderQR, do not stop decomposition on a small pivot 2015-01-30 19:04:04 +01:00
Gael Guennebaud
8296c4aaed Supernodes were disabled.
(grafted from 9d82f7e30d)
2015-01-30 17:24:40 +01:00
Gael Guennebaud
9d82f7e30d Supernodes were disabled. 2015-01-30 17:24:40 +01:00
Benoit Steiner
e896c0ade7 Marked the contraction operation as read only, since its result can't be assigned. 2015-01-29 10:29:47 -08:00
Benoit Steiner
5a6ea4edf6 Added more tests to cover tensor reductions 2015-01-28 10:02:47 -08:00
Gael Guennebaud
b613173350 bug #933: RealSchur, do not consider the input matrix norm to check negligible sub-diag entries. This also makes this test consistent with the complex and self-adjoint cases.
(grafted from a727a2c4ed)
2015-01-28 16:07:51 +01:00
Gael Guennebaud
a727a2c4ed bug #933: RealSchur, do not consider the input matrix norm to check negligible sub-diag entries. This also makes this test consistent with the complex and self-adjoint cases. 2015-01-28 16:07:51 +01:00
Benoit Steiner
9dfdbd7e56 Improved the performance of tensor reductions that preserve the innermost dimension(s). 2015-01-27 14:15:31 -08:00
Benoit Steiner
46fc881e4a Added a few benchmarks for the tensor code 2015-01-26 17:46:40 -08:00
Gael Guennebaud
c6eb84aabc Enable vectorization of transposeInPlace for PacketSize x PacketSize matrices 2015-01-26 17:09:01 +01:00
Gael Guennebaud
e1f1091fde Add support for dense ?= diagonal 2015-01-24 10:32:49 +01:00
Gael Guennebaud
638c6948d7 Added tag 3.2.4 for changeset e6952a51ba 2015-01-21 17:26:53 +01:00
Gael Guennebaud
e6952a51ba bump to 3.2.4 2015-01-21 17:26:41 +01:00
Gael Guennebaud
b9d314ae19 bug #329: fix typo 2015-01-17 21:55:33 +01:00
Gael Guennebaud
0039cd9cf9 bug #329: fix typo
(grafted from b9d314ae19)
2015-01-17 21:55:33 +01:00
Benoit Steiner
14f537c296 gcc doesn't consider that
template<typename OtherDerived> TensorStridingOp& operator = (const OtherDerived& other)
provides a valid assignment operator for the striding operation, and therefore refuses to compile code like:
result.stride(foo) = source.stride(bar);

Added the explicit
   TensorStridingOp& operator = (const TensorStridingOp& other)

as a workaround to get the code to compile, and did the same in all the operations that can be used as lvalues.
2015-01-16 09:09:23 -08:00
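
The underlying C++ rule: a member template is never a copy-assignment operator, so the implicitly generated one still exists and is preferred for same-type assignment. A minimal sketch of the workaround with a hypothetical Expr class:

    struct Expr {
      int value;

      template <typename Other>
      Expr& assignFrom(const Other& other) {
        value = other.value;  // the intended, expression-aware path
        return *this;
      }

      template <typename OtherDerived>
      Expr& operator=(const OtherDerived& other) { return assignFrom(other); }

      // The workaround: an explicit same-type overload, so Expr = Expr also
      // routes through assignFrom instead of the implicit member-wise copy.
      Expr& operator=(const Expr& other) { return assignFrom(other); }
    };

    int main() {
      Expr a = {1}, b = {2};
      a = b;
      return a.value == 2 ? 0 : 1;
    }
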
Benoit Steiner
641e824c56 Added cube() operation 2015-01-15 11:11:48 -08:00
Benoit Steiner
b5124e7cfd Created many additional tests 2015-01-14 15:46:04 -08:00
Benoit Steiner
54e3633b43 Updated the list of include files 2015-01-14 15:43:38 -08:00
Benoit Steiner
f697df7237 Improved support for RowMajor tensors
Misc fixes and API cleanups.
2015-01-14 15:38:48 -08:00
Benoit Steiner
6559d09c60 Ensured that each thread has its own copy of the TensorEvaluator: this avoids race conditions when the evaluator calls a non-thread-safe functor, e.g. when generating random numbers. 2015-01-14 15:34:50 -08:00
Benoit Steiner
8a382aa119 Improved the resizing of tensors 2015-01-14 15:33:11 -08:00
Benoit Steiner
703c526355 Misc improvements 2015-01-14 15:31:52 -08:00
Benoit Steiner
4cdf3fe427 Misc fixes 2015-01-14 15:30:47 -08:00
Benoit Steiner
0feff6e987 Expanded the functionality of index lists 2015-01-14 15:29:48 -08:00
Gael Guennebaud
cd679f2c47 Fix doc: setConstant does not exist for SparseMatrix. 2015-01-14 22:06:09 +01:00
Gael Guennebaud
f074d43f4b Fix doc: setConstant does not exist for SparseMatrix.
(grafted from cd679f2c47)
2015-01-14 22:06:09 +01:00
Benoit Steiner
1ac8600126 Fixed the return type of coefficient wise operations. For example, the abs function returns a floating point value when called on a complex input. 2015-01-14 12:47:46 -08:00
Benoit Steiner
378bdfb7f0 Added missing apis to the TensorMap class 2015-01-14 12:45:20 -08:00
Benoit Steiner
0526dc1bb4 Added missing apis to the tensor class 2015-01-14 12:44:08 -08:00
Benoit Steiner
1a36590e84 Fixed the printing of RowMajor tensors 2015-01-14 12:43:20 -08:00
Benoit Steiner
7e0b6c56b4 Added ability to initialize a tensor using an initializer list 2015-01-14 12:41:30 -08:00
Benoit Steiner
b12dd1ae3c Misc improvements for fixed size tensors 2015-01-14 12:39:34 -08:00
Benoit Steiner
71676eaddd Added support for RowMajor inputs to the contraction code. 2015-01-14 12:36:57 -08:00
Benoit Steiner
0a0ab6dd15 Increased the functionality of the tensor devices 2015-01-14 11:45:17 -08:00
Benoit Steiner
5692723c58 Improved the performance of the contraction code on CUDA 2015-01-14 11:42:52 -08:00
Benoit Steiner
8f4b8d204b Improved the performance of tensor reductions
Added the ability to generate random numbers following a normal distribution
Created a test to validate the ability to generate random numbers.
2015-01-14 10:19:33 -08:00
Benoit Steiner
3bd2b41b2e Created a test for tensor type casting 2015-01-14 10:17:02 -08:00
Benoit Steiner
4928ea1212 Added ability to reverse the order of the coefficients in a tensor 2015-01-14 10:15:58 -08:00
Benoit Steiner
b00fe1590d Added ability to swap the layout of a tensor 2015-01-14 10:14:46 -08:00
Benoit Steiner
c94174b4fe Improved tensor references 2015-01-14 10:13:08 -08:00
Benoit Steiner
91dd53e54d Created some documentation 2015-01-13 16:07:51 -08:00
Gael Guennebaud
279786e987 Fix missing evaluator in outer-product 2015-01-13 10:25:50 +01:00
Gael Guennebaud
699c80e404 bug #927: backport some unit tests for Rotation2D 2015-01-13 10:11:44 +01:00
Gael Guennebaud
ae4644cc68 bug #907, ARM64: workaround ICE in xcode/clang 2015-01-13 10:03:00 +01:00
Gael Guennebaud
36f7c1337f bug #907, ARM64: workaround vreinterpretq_u64_* not defined in xcode/clang 2015-01-13 09:57:37 +01:00
Gael Guennebaud
5023afc0af Fix NEON compilation: use EIGEN_ARM_PREFETCH instead of __pld 2015-01-13 09:25:24 +01:00
Gael Guennebaud
63974bcb88 Bug #907: workaround some missing intrinsics in current NDK's gcc version (ARM64) 2015-01-07 09:44:25 +01:00
Gael Guennebaud
79f4a59ed9 bug #907: fix compilation with ARM64 2015-01-07 09:41:56 +01:00
Benoit Steiner
9f98650d0a Ensured that contractions that can be reduced to a matrix vector product work correctly even when the input coefficients aren't aligned. 2015-01-06 09:29:13 -08:00
Gael Guennebaud
8638dbb809 Fix bug #925: typo in MatLab versions of middleRows
(grafted from db5b0741b5)
2015-01-04 21:39:50 +01:00
Gael Guennebaud
db5b0741b5 Fix bug #925: typo in MatLab versions of middleRows 2015-01-04 21:39:50 +01:00
Gael Guennebaud
8efa5bb439 bug #921: fix utilization of bitwise operation on enums in first_aligned
(grafted from f5f6e2c6f4)
2014-12-19 14:41:59 +01:00
Gael Guennebaud
f5f6e2c6f4 bug #921: fix utilization of bitwise operation on enums in first_aligned 2014-12-19 14:41:59 +01:00
Gael Guennebaud
a5a3a994c8 bug #920: fix MSVC 2015 compilation issues 2014-12-18 22:58:15 +01:00
Gael Guennebaud
25c7d9164f bug #920: fix MSVC 2015 compilation issues 2014-12-18 22:58:15 +01:00
Gael Guennebaud
ba44761435 bug #920: fix compilation issue with MSVC 2015 2014-12-18 22:47:48 +01:00
Gael Guennebaud
1a96594607 rm explicit keyword introduced by backporting another change 2014-12-18 14:53:40 +01:00
Gael Guennebaud
61db9a0e89 Added tag 3.2.3 for changeset bc129ad79c 2014-12-16 18:31:04 +01:00
Gael Guennebaud
bc129ad79c bump to 3.2.3 2014-12-16 18:30:52 +01:00
Gael Guennebaud
f5328be65a SparseQR is really for rows>=columns, so let's only check such cases 2014-12-16 18:23:13 +01:00
Gael Guennebaud
735f1fda39 Fix false negatives in geo_transformations unit tests 2014-12-16 16:50:30 +01:00
Gael Guennebaud
57ab550a17 Fix false negative in nullary unit test when extended precision is used (FPU). 2014-12-16 16:23:47 +01:00
Gael Guennebaud
e887c61b3d bug #821: workaround MSVC 2013 issue with using Base::Base::operator= 2014-12-16 13:33:43 +01:00
Gael Guennebaud
b8d9eaa19b Use true compile time "if" for Transform::makeAffine 2014-12-13 22:16:39 +01:00
Gael Guennebaud
f806c23012 Fix false negatives in geo_transformations unit tests 2014-12-16 16:50:30 +01:00
Gael Guennebaud
99501a2c4c Fix false negative in nullary unit test when extended precision is used (FPU). 2014-12-16 16:23:47 +01:00
Gael Guennebaud
7dad5f797e bug #821: workaround MSVC 2013 issue with using Base::Base::operator= 2014-12-16 13:33:43 +01:00
Christoph Hertzberg
dcad508986 At least CMAKE 2.8.4 is required for WORKING_DIRECTORY option in add_test 2014-12-15 12:45:29 +01:00
Gael Guennebaud
26977e281e Use true compile time "if" for Transform::makeAffine 2014-12-13 22:16:39 +01:00
Gael Guennebaud
1e109e1757 fix signed to unsigned conversion warning 2014-12-13 21:48:48 +01:00
Christoph Hertzberg
e469ac55c3 BVH appears to compile well with clang (re-enabled unit test) 2014-12-12 17:36:22 +01:00
Christoph Hertzberg
874f345562 Removed unused typedef 2014-12-12 12:03:50 +01:00
Christoph Hertzberg
608733415a Free functions should only be declared as static in separate compilation units
(grafted from d85abc89c5)
2014-12-12 12:01:03 +01:00
Gael Guennebaud
57ec399ec9 Remove unused fortran files 2014-12-13 21:41:25 +01:00
Christoph Hertzberg
d85abc89c5 Free functions should only be declared as static in separate compilation units 2014-12-12 12:01:03 +01:00
Christoph Hertzberg
309620ee1f Make absolutely sure that tau is initialized (this change suppresses a gcc warning) 2014-12-12 11:53:24 +01:00
Gael Guennebaud
56ca44ad1a Use f2c generated code instead of the original fortran code, except for dotc/dotu. 2014-12-11 17:03:41 +01:00
Christoph Hertzberg
e8cdbedefb bug #877, bug #572: Introduce a global Index typedef. Rename Sparse*::Index to StorageIndex, make Dense*::StorageIndex an alias to DenseIndex. Overall this commit gets rid of all Index conversion warnings. 2014-12-04 22:48:53 +01:00
Gael Guennebaud
6ccf97f3e6 Fix GL support wrt evaluators 2014-12-04 22:05:28 +01:00
Gael Guennebaud
433bce5c3a UmfPack support: fix redundant evaluation/copies when calling compute() and support generic expressions as input 2014-12-02 17:30:57 +01:00
Gael Guennebaud
775f7e5fbb bug #697: make sure empty classes are at the end in case of multiple inheritance 2014-12-02 14:40:19 +01:00
Gael Guennebaud
a819fa148d Fix MSVC compilation issue 2014-12-02 14:35:31 +01:00
Gael Guennebaud
1a8dc85142 bug #897: fix UmfPack usage with mapped sparse matrices 2014-12-02 13:57:13 +01:00
Gael Guennebaud
4974d1d2b4 Fix bug #911: m_extractedDataAreDirty was not initialized in UmfPackLU 2014-12-02 13:54:06 +01:00
Gael Guennebaud
e2f3e4e4aa Document non-const SparseMatrix::diagonal() method. 2014-12-01 14:45:15 +01:00
Gael Guennebaud
b26e697182 Make SparseMatrix::coeff() returns a const reference and add a non const version of SparseMatrix::diagonal() 2014-12-01 14:41:39 +01:00
Gael Guennebaud
b1f9f603a0 Simplify return type of diagonal(Index) (and ease compiler job) 2014-11-28 14:39:47 +01:00
Gael Guennebaud
5384e89147 Disable MatrixBase::bdcSvd with CUDA (just like MatrixBase::jacobiSvd) 2014-11-26 22:29:29 +01:00
Gael Guennebaud
8518ba0bbc Fix Hyperplane::Through(a,b,c) when points are aligned or identical. We use the same strategy as in Quaternion::setFromTwoVectors. 2014-11-26 15:01:53 +01:00
Tim Murray
80cae358b0 Adds a modified f2c-generated C implementation for BLAS.
This adds an optional implementation for the BLAS library that does
not require the use of a FORTRAN compiler. It can be enabled with
EIGEN_USE_F2C_BLAS.

The C implementation uses the standard gfortran calling convention
and does not require the use of -ff2c when compiled with gfortran.
2014-11-24 10:56:30 -08:00
Gael Guennebaud
0efaff9b3b Fix out-of-bounds write 2014-12-11 16:15:20 +01:00
Gael Guennebaud
41a20994cc In simplicial cholesky: avoid a deep copy of the input matrix if the latter can be used readily 2014-12-08 17:56:33 +01:00
Gael Guennebaud
a910a7466e Fix inner iterator type 2014-12-08 17:55:31 +01:00
Gael Guennebaud
4371911861 Remove useless and non standard numext::atanh2 function. 2014-12-08 16:44:34 +01:00
Gael Guennebaud
5fc4ce6449 bug #876: remove usage of atanh2 in matrix power 2014-12-08 16:44:05 +01:00
Gael Guennebaud
77294047d6 bug #876, matrix_log_compute_2x2: directly use log1p instead of atanh2 2014-12-08 16:28:06 +01:00
Gael Guennebaud
bea36925db bug #876: implement a portable log1p function 2014-12-08 16:26:53 +01:00
Gael Guennebaud
7f7a712062 Optimize Simplicial Cholesky when NaturalOrdering is used. 2014-12-08 15:02:25 +01:00
Gael Guennebaud
30c849669d Fix dynamic allocation in JacobiSVD (regression) 2014-12-08 14:45:04 +01:00
Gael Guennebaud
e0a8615b94 Merged in infinitei/eigen (pull request PR-91)
Added cmake uninstall target
2014-12-05 15:04:19 +01:00
Gael Guennebaud
8efd9142b3 Merged in infinitei/eigen-opengl-fixes (pull request PR-90)
Adding missing OPENGL_LIBRARIES for openglsupport test.
2014-12-05 12:54:57 +01:00
Gael Guennebaud
80ed5bd90c Workaround various "returning reference to temporary" warnings. 2014-12-05 12:49:30 +01:00
Abhijit Kundu
eb3695d2fc Added cmake uninstall target.
This adds a cmake command, make uninstall.
Running make uninstall removes the files installed by running make install.
2014-12-04 02:57:03 -05:00
Abhijit Kundu
48db34a7b9 Adding missing OPENGL_LIBRARIES for openglsupport test. Also adding OpenGL include directories as a better practice even though these are system include directories on most systems. 2014-12-04 01:18:47 -05:00
Gael Guennebaud
da584912b6 Fix memory pre-allocation when permuting inner vectors of a sparse matrix. 2014-11-24 17:31:59 +01:00
Benoit Steiner
509e4ddc02 Added reduction packet primitives for CUDA 2014-11-19 10:34:11 -08:00
Benoit Steiner
b33cf92878 Fixed the evaluation of expressions involving tensors of 2 or 3 elements on CUDA devices. 2014-11-18 14:32:41 -08:00
Benoit Steiner
1d3c8306f8 Fixed compilation errors with clang.
2014-11-13 19:13:17 -08:00
Benoit Steiner
ec785b0180 Added support for extraction of patches from images 2014-11-13 09:28:54 -08:00
Benoit Steiner
eeabf7975e Optimized broadcasting 2014-11-12 22:35:44 -08:00
Benoit Steiner
c2d1074932 Added support for static list of indices 2014-11-12 22:25:38 -08:00
Gael Guennebaud
722916e19d bug #903: clean swap API regarding extra enable_if parameters, and add failtests for swap 2014-11-06 09:25:26 +01:00
Benoit Steiner
cb37f818ca Fixed a compilation error triggered by some operations on fixed sized tensors 2014-11-05 23:25:11 -08:00
Benoit Steiner
9a06a71627 Fixed a test 2014-11-05 07:49:51 -08:00
Gael Guennebaud
4577bafb91 Bug #853: replace enable_if in Ref<> ctor by static assertions and add failtests for Ref<> 2014-11-05 16:15:17 +01:00
Christoph Hertzberg
739ed32222 Disable yet another Eigen2 deprecated warning 2014-12-11 16:49:07 +01:00
Christoph Hertzberg
58f0647f96 Disable another Eigen2 deprecated warning 2014-12-11 16:17:29 +01:00
Gael Guennebaud
d0c3fcd382 Fix out-of-bounds write 2014-12-11 16:12:15 +01:00
Gael Guennebaud
19e16fe15f Workaround warning when EIGEN_STACK_ALLOCATION_LIMIT==0 2014-12-11 14:38:35 +01:00
Gael Guennebaud
8f87be9e03 Remove unused typedefs and variables 2014-12-11 14:35:22 +01:00
Gael Guennebaud
58725ff08c Remove unused variables in eigen2support. 2014-12-11 14:26:19 +01:00
Gael Guennebaud
15bff016d1 Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING in eigen2support unit tests 2014-12-11 14:25:38 +01:00
Gael Guennebaud
c6fefe5d8e Bug #853: replace enable_if in Ref<> ctor by static assertions and add failtests for Ref<> 2014-11-05 16:15:17 +01:00
Gael Guennebaud
ee06f78679 Introduce unified macros to identify compiler, OS, and architecture. They are all defined in util/Macros.h and prefixed with EIGEN_COMP_, EIGEN_OS_, and EIGEN_ARCH_ respectively. 2014-11-04 21:58:52 +01:00
Benoit Steiner
9ea09179b5 Fixed the return type of the coefficient-wise tensor operations. 2014-11-04 10:24:42 -08:00
Benoit Steiner
b1789c112b Improved handling of 1d tensors 2014-11-03 08:51:33 -08:00
Benoit Steiner
2dde63499c Generalized the matrix vector product code. 2014-10-31 16:33:51 -07:00
Benoit Steiner
7f2c6ed2fa Fixed a compilation warning 2014-10-31 11:45:21 -07:00
Christoph Hertzberg
c5a3777666 Regression test for (invalid) bug #900. We should make it possible somehow to increase the problem size depending on the available RAM. 2014-10-31 17:19:05 +01:00
Christoph Hertzberg
0833b82efd Run sparse_basic unit tests also for rectangular matrices.
TriangularView with UnitDiag does not work properly yet (bug #901)
2014-10-31 17:12:13 +01:00
Benoit Steiner
85c3389b28 Fixed a test 2014-10-31 00:04:13 -07:00
Benoit Steiner
67fcf47ecb Merged from trunk 2014-10-30 21:59:22 -07:00
Benoit Steiner
fcecafde3a Fixed a compilation error with clang 2014-10-30 21:58:14 -07:00
Benoit Steiner
d62bfe73a9 Use the proper index type in the padding code 2014-10-30 18:15:05 -07:00
Benoit Steiner
bc99c5f7db fixed some potential alignment issues. 2014-10-30 18:09:53 -07:00
Benoit Steiner
1946cc4478 Added missing packet primitives for CUDA. 2014-10-30 17:52:32 -07:00
Benoit Steiner
5e62427e22 Use the proper index type 2014-10-30 17:49:39 -07:00
Christoph Hertzberg
4ec2f07a5b Fixed bug in SparseBlock which caused a segfault in sparse_extra_3 test 2014-10-30 21:34:10 +01:00
Christoph Hertzberg
883168ed94 Make select CUDA compatible (comparison operators aren't yet, so no test case yet) 2014-10-30 20:16:16 +01:00
Christoph Hertzberg
e5f134006b EIGEN_UNUSED_VARIABLE works better than casting to void. Make this also usable from CUDA code 2014-10-30 19:59:09 +01:00
Christoph Hertzberg
d2fc597d5b Removed deprecated header (unsupported/Eigen/BDCSVD is included in Eigen/SVD now) 2014-10-29 17:51:14 +01:00
Christoph Hertzberg
3d25b1f5b8 Split up some test cases 2014-10-29 17:46:54 +01:00
Christoph Hertzberg
acecb7b09f Fixed include in bdcsvd.cpp 2014-10-29 17:46:33 +01:00
Gael Guennebaud
21c0a2ce0c Move D&C SVD to official SVD module. 2014-10-29 11:29:33 +01:00
Benoit Steiner
debc97821c Added support for tensor references 2014-10-28 23:10:13 -07:00
Christoph Hertzberg
e2e7ba9f85 bug #898: add inline hint to const_cast_ptr 2014-10-28 14:49:44 +01:00
Christoph Hertzberg
bd2d330b25 Temporary workaround for bug #875:
Let TriangularView<Sparse>::nonZeros() return nonZeros() of the nested expression
2014-10-28 13:31:00 +01:00
Konstantinos Margaritis
79225db0b6 Merged in kmargar/eigen (pull request PR-87)
Extend NEON to add ARMv8 64-bit double support
2014-10-28 13:08:53 +02:00
Benjamin Chrétien
c426054767 BDCSVD: fix CMake install (missing separator). 2014-10-24 15:10:56 +02:00
Christoph Hertzberg
1fa793cb97 Removed weird self assignment. 2014-10-24 13:19:19 +02:00
Christoph Hertzberg
04ffb9956e Replace TEST_SET_BUT_UNUSED_VARIABLE by already defined EIGEN_UNUSED_VARIABLE 2014-10-24 13:18:23 +02:00
Konstantinos Margaritis
94ed7c81e6 Bug #896: Swap order of checking __VSX__/__ALTIVEC__ 2014-10-22 06:15:18 -04:00
Konstantinos Margaritis
fcb3573d17 Merged eigen/eigen into default 2014-10-22 10:42:18 +03:00
Konstantinos Margaritis
fae4fd7a26 Added ARMv8 support 2014-10-22 07:39:49 +00:00
Christoph Hertzberg
cf09c5f687 Prevent the CUDA "calling a __host__ function from a __host__ __device__ function is not allowed" error. 2014-10-21 20:40:09 +02:00
Konstantinos Margaritis
b508619392 working 64-bit support in PacketMath.h, Complex.h needed 2014-10-21 18:10:33 +00:00
Konstantinos Margaritis
0f65f2762d add EIGEN_TEST_NEON64, but it's a dummy: AArch64 implies NEON support, so no extra CXXFLAGS are needed 2014-10-21 18:10:01 +00:00
Konstantinos Margaritis
87524922dc check for __ARM_NEON instead as it's defined in arm64 as well 2014-10-21 18:08:50 +00:00
Gael Guennebaud
a303b6a733 bug #670: add unit test for mapped input in sparse solver. 2014-10-20 16:46:47 +02:00
Gael Guennebaud
fe57b2f963 bug #701: workaround (min) and (max) blocking ADL by introducing numext::mini and numext::maxi internal functions and a EIGEN_NOT_A_MACRO macro. 2014-10-20 15:55:32 +02:00
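
The problem being worked around: <windows.h> famously defines min and max as function-like macros, which mangles calls spelled std::min(a,b), and wrapping a call in parentheses to dodge the macro also disables argument-dependent lookup. A helper in the spirit of numext::mini (a sketch, not the actual implementation) hides the trick behind one name:

    #include <algorithm>

    template <typename T>
    T mini(const T& a, const T& b) {
      // The parentheses keep the `min` token from being followed by '(',
      // so a function-like `min` macro cannot expand here.
      return (std::min)(a, b);
    }

    int main() {
      return mini(2, 1) == 1 ? 0 : 1;
    }
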
Christoph Hertzberg
c12b7896d0 bug #766: Check minimum CUDA version 2014-10-20 14:23:11 +02:00
Gael Guennebaud
973e6a035f bug #718: Introduce a compilation error when using the wrong InnerIterator type with a SparseVector 2014-10-20 14:07:08 +02:00
Christoph Hertzberg
84aaa03182 Addendum to bug #859: pexp(NaN) for double did not return NaN; also, plog(NaN) did not return NaN.
psqrt(NaN) and psqrt(-1) shall return NaN if EIGEN_FAST_MATH==0
2014-10-20 13:13:43 +02:00
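
For reference, the scalar semantics the packet ops are being aligned with (a quick standalone check):

    #include <cassert>
    #include <cmath>

    int main() {
      double nan = std::nan("");
      assert(std::isnan(std::exp(nan)));    // exp(NaN)  -> NaN
      assert(std::isnan(std::log(nan)));    // log(NaN)  -> NaN
      assert(std::isnan(std::sqrt(nan)));   // sqrt(NaN) -> NaN
      assert(std::isnan(std::sqrt(-1.0)));  // sqrt(-1)  -> NaN
      return 0;
    }
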
Gael Guennebaud
aa5f79206f Fix bug #859: pexp(NaN) returned Inf instead of NaN 2014-10-20 11:38:51 +02:00
Gael Guennebaud
b4a9b3f496 Add unit tests for Rotation2D's inverse(), operator*, slerp, and fix regression wrt explicit ctor change 2014-10-20 11:04:32 +02:00
Gael Guennebaud
d04f23260d Fix bug #894: the sign of LDLT was not re-initialized at each call of compute() 2014-10-20 10:48:40 +02:00
Gael Guennebaud
8838b0a1ff Fix SparseQR::rank for a completely empty matrix. 2014-10-19 22:42:20 +02:00
Benoit Steiner
f786897e4b Added access to the underlying raw data of a tensor slice/chip whenever possible 2014-10-17 15:33:27 -07:00
Benoit Steiner
7acd38d19e Created some benchmarks for the tensor code 2014-10-17 09:49:03 -07:00
Gael Guennebaud
b50e5bc816 merge 2014-10-17 16:53:18 +02:00
Gael Guennebaud
a370b1f2e2 Fix SparseLU::absDeterminant and add respective unit test 2014-10-17 16:52:56 +02:00
Gael Guennebaud
a13bc22204 Ignore automatically imported lapack source files 2014-10-17 15:34:39 +02:00
Gael Guennebaud
4b7c3abbea Fix D&C SVD wrt zero matrices 2014-10-17 15:32:55 +02:00
Gael Guennebaud
feacfa5f83 Fix JacobiSVD wrt under/overflow by doing scaling prior to QR preconditioning 2014-10-17 15:32:06 +02:00
Gael Guennebaud
8472e697ca Add lapack interface to JacobiSVD and BDCSVD 2014-10-17 15:31:11 +02:00
Benoit Steiner
65af852b54 Silenced one last warning 2014-10-16 15:02:30 -07:00
Benoit Steiner
ae697b471c Silenced a few compilation warnings
Generalized a TensorMap constructor
2014-10-16 14:52:50 -07:00
Benoit Steiner
94e47798f4 Fixed the return types of unary and binary expressions to properly handle the case where it is different from the input type (e.g. abs(complex<float>)) 2014-10-16 10:41:07 -07:00
Benoit Steiner
d853adffdb Avoid calling get_future() more than once on a given promise. 2014-10-16 10:10:04 -07:00
Mark Borgerding
880e72c130 quieted more g++ warnings of the form: warning: typedef XXX locally defined but not used [-Wunused-local-typedefs] 2014-10-16 09:19:32 -04:00
Benoit Steiner
bfdd9f3ac9 Made the blocking computation aware of the l3 cache
Also optimized the blocking parameters to take into account the number of threads used for a computation
2014-10-15 15:32:59 -07:00
Gael Guennebaud
c566cfe2ba Make the SVD unit test even tougher 2014-10-15 23:37:47 +02:00
Benoit Steiner
dba55041ab Added support for promises
Started to improve multithreaded contractions
2014-10-15 11:20:36 -07:00
Gael Guennebaud
fd1aaf4772 merge 2014-10-15 16:33:14 +02:00
Gael Guennebaud
c806009453 Extend svd unit tests to stress problems with duplicated singular values. 2014-10-15 16:32:16 +02:00
Gael Guennebaud
2cc41dbe83 D&C SVD: fix some numerical issues by truly skipping deflated singular values when computing them 2014-10-15 15:21:12 +02:00
Gael Guennebaud
c26e8a1af3 D&C SVD: fix deflation of repeated singular values, fix sorting of singular values, fix case of complete deflation 2014-10-15 11:59:21 +02:00
Christoph Hertzberg
0ec1fc9e11 bug #891: Determine sizeof(void*) via CMAKE variable instead of test program 2014-10-14 14:14:25 +02:00
Benoit Steiner
99d75235a9 Misc improvements and cleanups 2014-10-13 17:02:09 -07:00
Benoit Steiner
4c70b0a762 Added support for patch extraction 2014-10-13 10:04:04 -07:00
Christoph Hertzberg
d3f52debc6 Make cuda_basic test compile again by adding lots of EIGEN_DEVICE_FUNC.
Although the test passes now, there might still be some missing.
2014-10-13 17:18:26 +02:00
Benoit Steiner
0219f8aed4 Added ability to print a tensor using an iostream. 2014-10-10 16:17:26 -07:00
Benoit Steiner
2ed1838aeb Added support for tensor chips 2014-10-10 16:11:27 -07:00
Benoit Steiner
4b36c3591f Fixed the tensor shuffling test 2014-10-10 15:43:21 -07:00
Benoit Steiner
a991f94c0e Fixed the thread pool test 2014-10-10 15:20:37 -07:00
Benoit Steiner
498b7eed25 Rewrote the TensorBase::random method to support the generation of random number on gpu. 2014-10-09 15:39:13 -07:00
Benoit Steiner
767424af18 Improved the functors defined for standard reductions
Added a functor to encapsulate the generation of random numbers on cpu and gpu.
2014-10-09 15:36:23 -07:00
Gael Guennebaud
a80e17cfe8 Remove unused and dangerous CompressedStorage::Map function 2014-10-09 23:42:33 +02:00
Gael Guennebaud
349c2c9235 bug #367: fix double copies in atWithInsertion, and add respective unit-test 2014-10-09 23:35:49 +02:00
Gael Guennebaud
48d537f59f Fix indentation 2014-10-09 23:35:26 +02:00
Gael Guennebaud
538c059aa4 bug #887: fix CompressedStorage::reallocate wrt memory leaks 2014-10-09 23:35:05 +02:00
Gael Guennebaud
a48b82eece Add a scoped_array helper class to handle locally allocated/used arrays 2014-10-09 23:34:05 +02:00
Gael Guennebaud
ccd70ba123 Various numerical fixes in D&C SVD: I cannot make it fail with double, but still need to tune for single precision, and carefully test with duplicated singular values 2014-10-09 23:29:01 +02:00
Benoit Steiner
44beee9d68 Removed dead code 2014-10-08 14:14:20 -07:00
Benoit Steiner
0a07ac574e Added support for the *= and /= operators to TensorBase 2014-10-08 13:32:41 -07:00
Benoit Steiner
6c047d398d Fixed a comment 2014-10-08 13:29:36 -07:00
Gael Guennebaud
4b886e6b39 bug #889: fix protected typedef 2014-10-08 07:48:30 +02:00
Gael Guennebaud
5741349294 bug #882: fix various const-correctness issues with *View classes. 2014-10-07 18:29:28 +02:00
Gael Guennebaud
118b1113d9 Workaround MSVC issue. 2014-10-07 09:53:39 +02:00
Gael Guennebaud
503c176d8e Fix missing outer() member in DynamicSparseMatrix 2014-10-07 09:53:27 +02:00
Gael Guennebaud
dbdd8b0883 D&C SVD: add scaling to avoid overflow, fix handling of fixed size matrices 2014-10-06 19:35:57 +02:00
Gael Guennebaud
d44d432baa Re-enable products with triangular views of sparse matrices: we simply have to treat them as a sparse matrix. 2014-10-06 16:11:26 +02:00
Gael Guennebaud
893bfcf95f bug #887: use ei_declare_aligned_stack_constructed_variable instead of manual new[]/delete[] pairs in AMD and Parallelizer 2014-10-06 11:54:30 +02:00
Gael Guennebaud
fb53ff1eda Fix SparseLU regarding uncompressed inputs and avoid manual new/delete calls. 2014-10-06 11:42:31 +02:00
Gael Guennebaud
7a17639953 Extend unit tests to check uncompressed sparse inputs in sparse solvers 2014-10-06 11:41:50 +02:00
Benoit Steiner
bbce6fa65d define EIGEN_VECTORIZE_CUDA when compiling with nvcc 2014-10-03 19:55:35 -07:00
Benoit Steiner
95a430a2ca Vector primitives for CUDA 2014-10-03 19:45:19 -07:00
Benoit Steiner
152f3218ac Improved contraction test 2014-10-03 19:33:44 -07:00
Benoit Steiner
af2e5995e2 Improved support for CUDA devices.
Improved contractions on GPU
2014-10-03 19:18:07 -07:00
Benoit Steiner
1269392822 Created the IndexPair type to store pair of tensor indices. CUDA doesn't support std::pair so we can't use them when targeting GPUs.
Improved the performance on tensor contractions
2014-10-03 10:16:59 -07:00
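
Roughly the shape of such a type (a sketch; Eigen's actual definition may differ in details such as device annotations):

    template <typename Index>
    struct IndexPair {
      IndexPair() : first(0), second(0) {}
      IndexPair(Index f, Index s) : first(f), second(s) {}
      Index first;   // dimension of the left operand to contract
      Index second;  // dimension of the right operand to contract
    };

    int main() {
      IndexPair<int> dims(1, 0);
      return dims.first == 1 && dims.second == 0 ? 0 : 1;
    }
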
Benoit Steiner
b7271dffb5 Generalized the gebp apis 2014-10-02 16:51:57 -07:00
Benoit Steiner
8b2afe33a1 Fixes for the forced evaluation of tensor expressions
More tests
2014-10-02 10:39:36 -07:00
Benoit Steiner
5cc23199be More tests to validate the const-correctness of the tensor code. 2014-10-02 10:30:44 -07:00
Benoit Steiner
7caaf6453b Added support for tensor reductions and concatenations 2014-10-01 20:38:22 -07:00
Benoit Steiner
1c236f4c9a Added tests for tensors of const values and tensors of strings 2014-10-01 20:21:42 -07:00
Christoph Hertzberg
1fa6fe2abd template keyword not allowed before non-template function call 2014-10-01 14:33:55 +02:00
Konstantinos Margaritis
9d3c69952b fixed to make big-endian VSX work as well 2014-10-01 09:43:56 +00:00
Gael Guennebaud
5180bb5e47 Add missing default ctor in Rotation2D 2014-09-30 16:59:28 +02:00
Christoph Hertzberg
0187504912 Avoid `unneeded-internal-declaration' warning 2014-09-30 16:43:52 +02:00
Christoph Hertzberg
6d26deb894 Missing outerStride in AlignedVector3 resulted in infinite recursion 2014-09-30 16:43:19 +02:00
Christoph Hertzberg
81517eebc1 Missing explicit 2014-09-30 16:42:04 +02:00
Christoph Hertzberg
12d59465cb bug #884: Copy constructor of Ref shall never malloc, constructing from other RefBase shall only malloc if the memory layout is incompatible. 2014-09-30 14:57:54 +02:00
Christoph Hertzberg
e404841235 make sure that regex does not match cmake 2014-09-29 19:28:10 +00:00
Christoph Hertzberg
15c946338f Related to bug #880: Accept make as well as gmake when searching the MakeCommand. And don't include \n in match expression 2014-09-29 19:20:01 +02:00
Gael Guennebaud
56a0bbbbee Fix compilation with GCC 2014-09-29 18:28:18 +02:00
Gael Guennebaud
842e31cf5c Let KroneckerProduct exploit the recently introduced generic InnerIterator class. 2014-09-29 13:37:49 +02:00
Gael Guennebaud
abd3502e9e Introduce a generic InnerIterator class compatible with evaluators. 2014-09-29 13:36:57 +02:00
Gael Guennebaud
76c3cf6949 Re-enable -Wshorten-64-to-32 compilation flag. 2014-09-29 10:33:16 +02:00
Georg Drenkhahn
bc34ee3365 Using Index type instead of hard coded int type to prevent potential implicit integer conversion. 2014-09-22 18:56:36 +02:00
Georg Drenkhahn
9a04cd307c Avoided implicit integer conversion by using explicit integer type conversion. Added assert to catch overflow. 2014-09-22 18:47:33 +02:00
Gael Guennebaud
f0a62c90bc Avoid comparisons between different index types. 2014-09-29 10:27:51 +02:00
Georg Drenkhahn
2946992ad4 Using StorageIndexType for loop assigning initial permutation. Adding assert for index overflow. 2014-09-22 17:59:02 +02:00
Georg Drenkhahn
821ff0ecfb Using Index instead of hard coded int type to prevent potential implicit integer conversion 2014-09-22 16:12:35 +02:00
Georg Drenkhahn
2c4cace56c Using Index instead of hard coded int type to prevent potential implicit integer conversion 2014-09-22 15:54:34 +02:00
Georg Drenkhahn
8a502233d8 Correcting the ReturnType in traits<KroneckerProduct<>> to include the correct Index type.
Fixed mixup of types Rhs::Index and Lhs::Index in various loop variables.
Added explicit type conversion for arithmetic expressions which may return a wider type.
2014-09-21 23:19:29 +02:00
Georg Drenkhahn
b2755edcdd Replaced hard coded int types with Index types preventing implicit integer conversions. 2014-09-21 23:15:35 +02:00
Georg Drenkhahn
d1ef3c3546 Changed Diagonal::index() to return an Index type instead of int to prevent possible implicit conversion from long to int.
Added inline keyword to member methods.
2014-09-21 10:21:20 +02:00
Georg Drenkhahn
edaefeb978 Using Kernel::Index type instead of int to prevent possible implicit conversion from long to int. 2014-09-21 10:01:12 +02:00
Georg Drenkhahn
3bd31e21b5 Fixed compiler warning on implicit integer conversion by separating index type for matrix and permutation matrix which may not be equal. 2014-09-20 15:00:36 +02:00
Georg Drenkhahn
75e269c77b Fixed warning on implicit integer conversion in test case code by using type VectorXd::Index instead of int. 2014-09-20 14:57:42 +02:00
Gael Guennebaud
74cde0c925 Add missing return derived() in ArrayBase::operator= 2014-09-28 09:16:13 +02:00
Jitse Niesen
ce2035af86 New doc page on implementing a new expression class. 2014-09-27 23:25:58 +01:00
Konstantinos Margaritis
6d0f0b8cec add VSX identifier 2014-09-25 16:06:16 +00:00
Christoph Hertzberg
4ba8aa1482 Fix bug #884: No malloc for zero-sized matrices or for Ref without temporaries 2014-09-25 16:05:17 +02:00
Christoph Hertzberg
27d6b4daf9 Tridiagonalization::diagonal() and ::subDiagonal() did not work. Added unit-test 2014-09-24 14:37:13 +02:00
Gael Guennebaud
446001ef51 Fix nested_eval<Product<> > which wrongly returned a Product<> expression 2014-09-24 09:39:09 +02:00
Gael Guennebaud
13cbc751c9 bug #880: automatically preserves buildtool flags when modifying DartConfiguration.tcl file. 2014-09-23 22:10:32 +02:00
Christoph Hertzberg
421feea3b2 member_redux constructor is explicit too. Renamed some typedefs for more consistency. 2014-09-23 18:55:42 +02:00
Christoph Hertzberg
7817bc19a4 Removed FIXME, as it is actually necessary. 2014-09-23 17:23:34 +02:00
Christoph Hertzberg
eb13ada3aa Renamed CwiseInverseReturnType to InverseReturnType for ArrayBase::inverse() 2014-09-23 17:21:27 +02:00
Christoph Hertzberg
36448c9e28 Make constructors explicit if they could lead to unintended implicit conversion 2014-09-23 14:28:23 +02:00
Christoph Hertzberg
de0d8a010e Suppress stupid gcc-4.4 warning 2014-09-23 12:58:14 +02:00
Gael Guennebaud
72569f17ec bug #882: add const-correctness failtests for CwiseUnaryView, TriangularView, and SelfAdjointView. 2014-09-23 10:26:02 +02:00
Gael Guennebaud
3878e6f170 Add a true ctest unit test for failtests 2014-09-23 10:25:12 +02:00
Gael Guennebaud
ff46ec0f24 bug #881: make SparseMatrixBase::isApprox(SparseMatrixBase) exploit sparse computations instead of converting the operands to dense matrices. 2014-09-22 23:33:28 +02:00
Gael Guennebaud
ae514ddfe5 bug #880: manually edit the DartConfiguration.tcl file to get it working with cmake 3.0.x 2014-09-22 22:49:20 +02:00
Gael Guennebaud
f9d6d3780f bug #879: fix compilation of tri1=mat*tri2 by copying tri2 into a full temporary. 2014-09-22 17:34:17 +02:00
Gael Guennebaud
abba11bdcf Many improvements in Divide&Conquer SVD:
- Fix many numerical issues, in particular regarding deflation.
- Add heavy debugging output to help track numerical issues (there are still a few)
- Make use of Eigen's apply-inplane-rotation feature.
2014-09-22 15:22:52 +02:00
Christoph Hertzberg
d9e0336a78 Merged in kmargar/eigen (pull request PR-84)
Add VSX support
2014-09-22 12:57:06 +02:00
Jitse Niesen
333905b0c2 Fix typos in docs for IterativeLinearSolvers module 2014-09-21 14:20:08 +01:00
Jitse Niesen
5fa69422a2 Fix copy-and-paste typo in SolveWithGuess assignment
This fixes compilation of code snippets in BiCGSTAB docs.
2014-09-21 14:19:23 +01:00
Konstantinos Margaritis
de38ff2499 prefetches are no-ops on VSX, actually disable the prefetch trait 2014-09-21 11:56:07 +00:00
Konstantinos Margaritis
60e093a9dc Merged eigen/eigen into default 2014-09-21 14:02:51 +03:00
Konstantinos Margaritis
56408504e4 fix compile error on big endian altivec 2014-09-21 13:59:30 +03:00
Konstantinos Margaritis
974fe38ca3 prefetches are no-ops on VSX 2014-09-21 11:24:30 +00:00
Konstantinos Margaritis
c0205ca4af VSX supports vec_div, implement where appropriate (float/doubles) 2014-09-21 08:12:22 +00:00
Konstantinos Margaritis
10f8aabb61 VSX port passes packetmath_[1-5] tests! 2014-09-20 22:31:31 +00:00
Jitse Niesen
80de35b6c5 Remove double return statement in PlainObjectBase::_set() 2014-09-19 22:05:18 +01:00
Konstantinos Margaritis
60663a510a 32-bit floats/ints, 64-bit doubles pass packetmath tests, complex 32/64-bit remaining 2014-09-19 21:05:01 +00:00
Gael Guennebaud
03dd4dd91a Unify unit test for BDC and Jacobi SVD. This reveals some numerical issues in BDCSVD. 2014-09-19 15:25:48 +02:00
Gael Guennebaud
0a18eecab3 bug #100: add support for explicit scalar to Array conversion (as enabling implicit conversion is much trickier) 2014-09-19 13:25:28 +02:00
Gael Guennebaud
7b044c0ead Added tag before-evaluators for changeset 9452eb38f8 2014-09-19 10:10:29 +02:00
Gael Guennebaud
755e77266f Fix SparseQR for row-major inputs. 2014-09-19 09:58:56 +02:00
Gael Guennebaud
07c5500d70 Introduce a compilation error when using the wrong InnerIterator type. 2014-09-19 09:58:20 +02:00
Gael Guennebaud
e70506dd8f Fix inner-stride of AlignedVector3 2014-09-18 22:46:46 +02:00
Gael Guennebaud
2ae20d558b Update KroneckerProduct wrt evaluator changes 2014-09-18 22:08:49 +02:00
Gael Guennebaud
62bce6e5e6 Make MatrixFunction use nested_eval instead of nested 2014-09-18 17:31:17 +02:00
Gael Guennebaud
060e835ee9 Add evaluator for the experimental AlignedVector3 2014-09-18 17:30:21 +02:00
Gael Guennebaud
0ca43f7e9a Remove deprecated code not used by evaluators 2014-09-18 15:15:27 +02:00
Gael Guennebaud
8b3be4907d log2(int) must be inlined. 2014-09-18 10:53:53 +02:00
Gael Guennebaud
0bf5894861 workaround one more shadowing issue with MSVC 2014-09-16 18:21:39 -07:00
Gael Guennebaud
e44d78dab3 workaround ambiguous call 2014-09-16 17:10:25 -07:00
Gael Guennebaud
c2f66c65aa workaround MSVC compilation issue (shadow issue) 2014-09-16 16:23:45 -07:00
Gael Guennebaud
125619146b workaround weird MSVC compilation issue: a typedef in a base class shadows a template parameter of a derived class 2014-09-16 16:06:32 -07:00
Gael Guennebaud
341ae8665d avoid division by 0 2014-09-16 16:05:06 -07:00
Gael Guennebaud
fc23e93707 Add a portable log2 function for integers 2014-09-17 09:56:07 +02:00
Gael Guennebaud
0f0580b97c Remove unneeded template keyword. 2014-09-17 09:55:44 +02:00
Gael Guennebaud
486ca277a0 Workaround MSVC ICE 2014-09-16 10:29:29 -07:00
Benoit Steiner
10a79ca3a3 Merged latest updates from the Eigen trunk. 2014-09-15 09:18:16 -07:00
Gael Guennebaud
466d6d41c6 Avoid a potential risk of recursive definition using traits to get the scalar type 2014-09-15 17:40:17 +02:00
Gael Guennebaud
8514179aa3 Fix traits<Quaternion>::IsAligned when using evaluators 2014-09-15 13:53:52 +02:00
Gael Guennebaud
0403d49006 Fix inverse unit test making sure we try to invert an invertible matrix 2014-09-14 20:12:07 +02:00
Gael Guennebaud
c83e01f2d6 Favor column major storage for inner products 2014-09-14 19:38:49 +02:00
Gael Guennebaud
26db954776 Re-enable aliasing checks when using evaluators 2014-09-14 19:06:08 +02:00
Gael Guennebaud
fda680f9cf Adapt changeset 51b3f558bb
to evaluators:
(Fix bug #822: outer products needed linear access, and add respective unit tests)
2014-09-14 18:31:29 +02:00
Gael Guennebaud
dfc54e1bbf Fix /= when using evaluator as in changeset 2d90484450 2014-09-14 18:27:48 +02:00
Gael Guennebaud
749b56f6af merge with default branch 2014-09-14 17:34:54 +02:00
Gael Guennebaud
af9c9f7706 Fix comparison to block size 2014-09-14 17:33:39 +02:00
Konstantinos Margaritis
470aa15c35 First time it compiles, but fails to pass the tests. 2014-09-09 16:58:48 +00:00
Gael Guennebaud
188a13f9fe Fix compilation of coeff(Index) on sub-inner-panels 2014-09-08 09:50:03 +02:00
Benoit Steiner
efdff15749 Fixed a typo in the contraction code 2014-09-06 13:28:24 -07:00
Gael Guennebaud
dacd39ea76 Exploit sparse structure in naiveU and naiveV when updating them. 2014-09-05 17:51:46 +02:00
Benoit Steiner
74db22455a Misc fixes. 2014-09-05 07:47:43 -07:00
Gael Guennebaud
b23556bbbd Oops, a block size of 1 is not very useful, set it to 48 as in HouseholderQR 2014-09-05 08:50:50 +02:00
Benoit Steiner
1abe4ed14c Created more regression tests 2014-09-04 20:27:28 -07:00
Benoit Steiner
d43f737b4a Added support for evaluation of tensor shuffling operations as lvalues 2014-09-04 20:02:28 -07:00
Benoit Steiner
f50548e86a Added missing tensor copy constructors. As a result it is now possible to declare and initialize a tensor on the same line, as in:
Tensor<bla> T = A + B;  or
  Tensor<bla> T(A.reshape(new_shape));
2014-09-04 19:50:27 -07:00
Gael Guennebaud
15bad3670b Apply Householder U and V in-place. 2014-09-04 09:17:01 +02:00
Gael Guennebaud
8846aa6d1b Optimization: enable cache-efficient application of HouseholderSequence. 2014-09-04 09:15:59 +02:00
Gael Guennebaud
80993b95d3 Disable a test which had never worked without evaluators 2014-09-03 22:56:39 +02:00
Benoit Steiner
b24fe22b1a Improved the performance of the tensor convolution code by a factor of about 4. 2014-09-03 11:38:13 -07:00
Gael Guennebaud
c82dc227f1 Cleaning in BDCSVD (formatting, handling of transpose case, remove some for loops) 2014-09-03 10:15:24 +02:00
Gael Guennebaud
a96f3d629c Clean bdcsvd 2014-09-02 22:30:23 +02:00
Gael Guennebaud
47829e2d16 Disable the solve_ret_val-like mechanism with evaluators enabled 2014-09-01 18:32:59 +02:00
Gael Guennebaud
1f398dfc82 Factorize *SVD::solve to SVDBase 2014-09-01 18:31:54 +02:00
Gael Guennebaud
b3a0365429 merge with default branch 2014-09-01 18:21:01 +02:00
Gael Guennebaud
72c4f8ca8f Disable a few unit tests in unsupported 2014-09-01 17:35:58 +02:00
Gael Guennebaud
8754341848 Fix remaining garbage during a merge. 2014-09-01 17:25:13 +02:00
Gael Guennebaud
daad9585a3 Fix Kronecker product in legacy mode. 2014-09-01 17:24:07 +02:00
Gael Guennebaud
b051bbd64f Make unsupported sparse solvers use SparseSolverBase 2014-09-01 17:21:47 +02:00
Gael Guennebaud
b3d63b4db2 Add evaluator for DynamicSparseMatrix 2014-09-01 17:21:05 +02:00
Gael Guennebaud
1c4b69c5fb Factorize solveWithGuess in IterativeSolverBase 2014-09-01 17:19:51 +02:00
Gael Guennebaud
8a74ce922c Make IncompleteLUT use SparseSolverBase. 2014-09-01 17:19:16 +02:00
Gael Guennebaud
863b7362bc Fix usage of m_isInitialized in SparseLU and Pastix support. 2014-09-01 17:16:32 +02:00
Gael Guennebaud
1bf3b34849 Fix regression in sparse-sparse product 2014-09-01 17:15:08 +02:00
Gael Guennebaud
f9580a3473 Fix Cholmod support without evaluators 2014-09-01 17:14:30 +02:00
Gael Guennebaud
fbb53b6cbb Fix sparse matrix times sparse vector. 2014-09-01 16:53:52 +02:00
Gael Guennebaud
85c7659574 Refactoring of sparse solvers through a SparseSolverBase class and usage of the Solve<> expression. Introduce a SolveWithGuess expression on top of Solve. 2014-09-01 15:00:19 +02:00
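
A minimal sketch of what the SolveWithGuess expression enables at the API level, assuming the 3.2-era iterative-solver interface (the matrix and right-hand side below are made up for illustration):

    #include <Eigen/Dense>
    #include <Eigen/Sparse>
    #include <iostream>

    int main() {
      typedef Eigen::SparseMatrix<double> SpMat;
      SpMat A(3, 3);
      A.insert(0, 0) = 4.0; A.insert(1, 1) = 3.0; A.insert(2, 2) = 2.0;
      A.makeCompressed();
      Eigen::VectorXd b(3), x0(3);
      b << 1, 2, 3;
      x0.setZero();  // initial guess, e.g. the solution of a previous step
      Eigen::BiCGSTAB<SpMat> solver(A);
      // solveWithGuess returns a SolveWithGuess expression; it is only
      // evaluated when assigned to a concrete vector.
      Eigen::VectorXd x = solver.solveWithGuess(b, x0);
      std::cout << x.transpose() << std::endl;
    }
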
Gael Guennebaud
bc065c75d2 Implement the missing bits to make Solve compatible with sparse rhs 2014-09-01 14:50:59 +02:00
Gael Guennebaud
e6cc24cbd6 Fix compilation in legacy mode 2014-09-01 14:20:11 +02:00
Konstantinos Margaritis
7ff266e3ce Initial VSX commit 2014-08-29 20:03:49 +00:00
Gael Guennebaud
b4a709520d merge 2014-08-29 15:31:54 +02:00
Gael Guennebaud
c1d0f15bde Enable evaluators by default 2014-08-29 15:31:32 +02:00
Gael Guennebaud
124d12a915 merge default branch 2014-08-29 15:20:31 +02:00
Gael Guennebaud
f29dbec321 undef Unsable macro 2014-08-29 15:12:03 +02:00
Benoit Steiner
2959045f2f Optimized the tensor padding code. 2014-08-26 09:47:18 -07:00
Benoit Steiner
36fffe48f7 Misc API improvements and cleanups 2014-08-23 14:35:41 -07:00
Benoit Steiner
fb5c1e9097 Optimized and cleaned up the tensor morphing code 2014-08-23 13:18:30 -07:00
Benoit Steiner
3d298da269 Added support for broadcasting 2014-08-20 17:00:50 -07:00
Benoit Steiner
9ac3c821ea Improved the speed of convolutions when running on cuda devices 2014-08-19 16:57:10 -07:00
Benoit Steiner
33c702c79f Added support for fast integer divisions by a constant
Sped up tensor slicing by a factor of 3 by using these fast integer divisions.
2014-08-14 22:13:21 -07:00
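
For context, the classic trick behind such fast integer division is to replace n / d by a multiplication with a precomputed "magic" number followed by a shift. The sketch below shows the general round-up method (Granlund-Montgomery style); it is illustrative only and differs in detail from Eigen's actual TensorIntDiv implementation. It requires C++11 and unsigned __int128, a GCC/Clang extension.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    struct FastDivisor {
      unsigned __int128 m;  // magic multiplier: floor(2^(32+s) / d) + 1
      int s;                // shift amount: ceil(log2(d))
      explicit FastDivisor(uint32_t d) {
        assert(d > 0);
        s = 0;
        while ((uint64_t(1) << s) < d) ++s;
        m = ((unsigned __int128)1 << (32 + s)) / d + 1;
      }
      uint32_t divide(uint32_t n) const {
        // one multiply and one shift instead of a hardware divide
        return (uint32_t)((m * n) >> (32 + s));
      }
    };

    int main() {
      const uint32_t divisors[] = {1, 3, 7, 10, 1000};
      for (uint32_t d : divisors) {
        FastDivisor fd(d);
        for (uint32_t n = 0; n < 100000; ++n) assert(fd.divide(n) == n / d);
      }
      std::puts("fast division matches exact division");
    }
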
Benoit Steiner
756292f8aa Fixed compilation errors 2014-08-14 00:32:59 -07:00
Benoit Steiner
8c8db49331 Added a few regression tests 2014-08-14 00:25:22 -07:00
Benoit Steiner
eeb43f9e2b Added support for padding, striding, and shuffling 2014-08-14 00:22:47 -07:00
Benoit Steiner
16047c8d4a Pulled in the latest changes from the Eigen trunk 2014-08-13 22:25:29 -07:00
Benoit Steiner
916ef48846 Added ability to get the nth element from an abstract array type. 2014-08-13 08:44:47 -07:00
Benoit Steiner
f1d8c13dbc Fixed misc typos. 2014-08-13 08:40:26 -07:00
Benoit Steiner
9faad2932f Added missing APIs. 2014-08-13 08:36:33 -07:00
Benoit Steiner
f8fad09301 Updated the convolution and contraction evaluators to follow the new EvalSubExprsIfNeeded API. 2014-08-13 08:33:18 -07:00
Benoit Steiner
72e7529708 Fixed a typo. 2014-08-13 08:29:40 -07:00
Benoit Steiner
1aa2bf8274 Support for in place evaluation of expressions containing slicing and reshaping operations 2014-08-13 08:27:58 -07:00
Benoit Steiner
b1892ab14d Added support for in-place evaluation to simple tensor expressions.
Use memcpy to speed up tensor copies whenever possible.
2014-08-13 08:26:44 -07:00
Benoit Steiner
439feca139 Reworked the TensorExecutor code to support in place evaluation. 2014-08-13 08:22:05 -07:00
Gael Guennebaud
4dd55a2958 Optimize reductions for Homogeneous 2014-08-01 17:00:20 +02:00
Gael Guennebaud
f25338f4d7 Fix nesting of Homogeneous evaluator 2014-08-01 16:49:44 +02:00
Gael Guennebaud
51357a6622 Fix geo_orthomethods unit test for complexes 2014-08-01 16:26:23 +02:00
Gael Guennebaud
107bb308c3 Fix various small issues detected by gcc 2014-08-01 16:24:23 +02:00
Gael Guennebaud
c2ff44cbf3 Make assignment from general EigenBase object call evaluator, and support dense X= sparse 2014-08-01 16:23:30 +02:00
Gael Guennebaud
2a3c3c49a1 Fix numerous nested versus nested_eval shortcomings 2014-08-01 14:48:22 +02:00
Gael Guennebaud
fc13b37c55 Make cross product uses nested/nested_eval 2014-08-01 14:47:33 +02:00
Benoit Steiner
647622281e The tensor assignment code now resizes the destination tensor as needed. 2014-07-31 17:39:04 -07:00
Gael Guennebaud
26d2cdefd4 Fix 4x4 inverse via SSE for submatrices 2014-07-31 16:24:29 +02:00
Gael Guennebaud
db183ca7b3 Make minimal changes to make Homogeneous compatible with evaluators 2014-07-31 14:54:54 +02:00
Gael Guennebaud
702a3c17db Make Transform expose sizes: Dim+1 x Dim+1 for projective transform, and Dim x Dim+1 for all others 2014-07-31 14:54:00 +02:00
Gael Guennebaud
5f5a8d97c0 Re-enable main unit tests which are now compiling and running fine with evaluators 2014-07-31 13:43:19 +02:00
Gael Guennebaud
bae2e3327b Call product_generic_impl by default, and remove a lot of boilerplate code 2014-07-31 13:35:49 +02:00
Gael Guennebaud
cd0ff253ec Make permutation compatible with sparse matrices 2014-07-30 15:22:50 +02:00
Gael Guennebaud
929e77192c Various minor fixes 2014-07-30 11:39:52 +02:00
Benoit Steiner
2116e261fb Made sure that the data stored in fixed sized tensor is aligned. 2014-07-25 09:47:59 -07:00
Benoit Steiner
1f371e78e6 Added a few tests to validate the behavior of the assignment operator. 2014-07-22 10:32:40 -07:00
Benoit Steiner
f7bb7ee3f3 Fixed the assignment operator of the Tensor and TensorMap classes. 2014-07-22 10:31:21 -07:00
Gael Guennebaud
baa77ffe38 Fix max sizes at compile time of DiagonalWrapper 2014-07-22 16:13:56 +02:00
Gael Guennebaud
4aac87251f Re-enable a couple of unit tests with evaluators. 2014-07-22 12:54:03 +02:00
Gael Guennebaud
6daa6a0d16 Refactor TriangularView to handle both dense and sparse objects. Introduce a glu_shape<S1,S2> helper to assemble sparse/dense shapes with triangular/selfadjoint views. 2014-07-22 11:35:56 +02:00
Gael Guennebaud
2a251ffab0 Implement evaluator for sparse-selfadjoint products 2014-07-22 09:32:40 +02:00
Gael Guennebaud
9b729f93a1 Resizing is done by call_assignment_noalias, so no need to perform it when dealing with aliasing. 2014-07-21 11:46:47 +02:00
Gael Guennebaud
946b99dd5c Extend qr unit test 2014-07-21 11:45:54 +02:00
Gael Guennebaud
50eef6dfc3 Compilation fixes 2014-07-20 15:16:34 +02:00
Gael Guennebaud
62f332fc04 Make sure we evaluate into temporaries matching evaluator storage order requirements 2014-07-19 15:19:10 +02:00
Gael Guennebaud
3eba5e1101 Implement evaluator for sparse outer products 2014-07-19 14:55:56 +02:00
Gael Guennebaud
36e6c9064f bug #770: fix out of bounds access 2014-07-18 14:19:18 +02:00
Gael Guennebaud
a325d1cb1e merge with default branch 2014-07-18 11:02:22 +02:00
Gael Guennebaud
2bdb3b1afd Extend dense*sparse product unit tests 2014-07-15 11:00:16 +02:00
Gael Guennebaud
3c7686630d merge with default branch 2014-07-15 10:55:03 +02:00
Gael Guennebaud
296cb40161 merge with default branch 2014-07-10 22:04:45 +02:00
Benoit Steiner
40bb98e76a Added primitives to compare tensor dimensions 2014-07-10 11:29:51 -07:00
Benoit Steiner
9b7a6f0122 Added tests for tensor slicing 2014-07-10 11:27:27 -07:00
Benoit Steiner
ffd3654f67 Vectorized the evaluation of expressions involving tensor slices. 2014-07-10 11:09:46 -07:00
Benoit Steiner
25b2f6624d Improved the speed of slicing operations. 2014-07-09 12:48:34 -07:00
Christoph Hertzberg
547d660f1d Determine version of Metis library. Apparently, at least version 5.x is needed for Eigen/MetisSupport.
Marked some internal variables as advanced
2014-07-09 16:54:15 +02:00
Abhijit Kundu
5633cde9ad Adding missing OPENGL_LIBRARIES for openglsupport test. Also adding OpenGL include directories as a better practice even though these are system include directories in most systems.
(grafted from 48db34a7b9
)
2014-12-04 01:18:47 -05:00
Gael Guennebaud
fe8757a576 Update mpreal version. 2014-12-11 11:51:00 +01:00
Gael Guennebaud
ff29221951 Fix MSVC compilation 2014-12-10 21:55:11 +01:00
Gael Guennebaud
7fbc9d8409 Introduce a ReplicateReturnType as a possible workaround of a compilation issue with MSVC+ICC 2014-12-10 14:26:25 +01:00
Gael Guennebaud
79c3cfabe3 Fix nomalloc_3 and binding reference to temporary issue 2014-12-09 19:01:25 +01:00
Gael Guennebaud
e0f390793c Fix dynamic allocation in JacobiSVD (regression)
(grafted from 30c849669d
)
2014-12-08 14:45:04 +01:00
Gael Guennebaud
97812ad0d3 UmfPack support: fix redundant evaluation/copies when calling compute() and support generic expressions as input 2014-12-02 17:30:57 +01:00
Gael Guennebaud
d66b5a1d91 Fix MSVC compilation issue
(grafted from a819fa148d
)
2014-12-02 14:35:31 +01:00
Gael Guennebaud
b0152fdb1d Fix bicgstab example 2014-12-02 14:32:55 +01:00
Gael Guennebaud
e9c5418249 bug #897: fix UmfPack usage with mapped sparse matrices
(grafted from 1a8dc85142
)
2014-12-02 13:57:13 +01:00
Gael Guennebaud
b25b517817 Fix bug #911: m_extractedDataAreDirty was not initialized in UmfPackLU
(grafted from 4974d1d2b4
)
2014-12-02 13:54:06 +01:00
Gael Guennebaud
ce0fb1bca1 Simplify return type of diagonal(Index) (and ease compiler job) 2014-11-28 14:39:47 +01:00
Christoph Hertzberg
92fce631ed added std:: scope to abs function call 2014-11-28 02:24:51 +00:00
Christoph Hertzberg
238308e0f7 bug #909: Removed unreachable return statement 2014-11-26 15:45:11 +01:00
Gael Guennebaud
719ac0d6b0 Fix Hyperplane::Through(a,b,c) when points are aligned or identical. We use the same strategy as in Quaternion::setFromTwoVectors.
(grafted from 8518ba0bbc
)
2014-11-26 15:01:53 +01:00
Gael Guennebaud
8e61a7aab6 Fix a case where 0-1 leads to Dynamic instead of 0. 2014-11-26 15:03:22 +01:00
Gael Guennebaud
09e992ce9f Add missing specialization of Block<const SparseMatrix> 2014-11-24 18:40:44 +01:00
Gael Guennebaud
cdd401f743 Enable Mx0 * 0xN matrix product. 2014-11-24 18:07:50 +01:00
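
A tiny illustration of the newly allowed empty product: the sum over an empty inner dimension is empty, so the result is an M-by-N zero matrix (sketch, sizes made up):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A(3, 0), B(0, 4);
      Eigen::MatrixXd C = A * B;  // 3x4, all coefficients zero
      std::cout << C.rows() << "x" << C.cols() << ", norm = " << C.norm() << "\n";
    }
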
Gael Guennebaud
59b7615d31 Fix memory pre-allocation when permuting inner vectors of a sparse matrix.
(grafted from da584912b6
)
2014-11-24 17:31:59 +01:00
Gael Guennebaud
a8cb0dfcf5 re-enable usage of ProductBase::m_result and workaround a compilation failure when m_result is too large but unused 2014-11-14 13:38:12 +01:00
Christoph Hertzberg
0e7a26c19f bug #898: add inline hint to const_cast_ptr 2014-10-28 14:51:05 +01:00
Christoph Hertzberg
13c636d864 Addendum to bug #859: pexp(NaN) for double did not return NaN; also, plog(NaN) did not return NaN.
psqrt(NaN) and psqrt(-1) shall return NaN if EIGEN_FAST_MATH==0
2014-10-20 13:35:03 +02:00
Gael Guennebaud
00ec1629ca Fix bug #859: pexp(NaN) returned Inf instead of NaN 2014-10-20 11:38:51 +02:00
Gael Guennebaud
a72eabec9b Fix bug #894: the sign of LDLT was not re-initialized at each call of compute()
(grafted from d04f23260d
)
2014-10-20 10:48:40 +02:00
Gael Guennebaud
235c97ba92 Fix SparseQR::rank for a completely empty matrix.
(grafted from 8838b0a1ff
)
2014-10-19 22:42:20 +02:00
Gael Guennebaud
4126cb6369 Fix SparseLU::absDeterminant and add respective unit test
(grafted from a370b1f2e2
)
2014-10-17 16:52:56 +02:00
Gael Guennebaud
8ea2ab4829 Fix JacobiSVD wrt under/overflow by doing scaling prior to QR preconditioning
(grafted from feacfa5f83
)
2014-10-17 15:32:06 +02:00
Christoph Hertzberg
9b79607579 bug #891: Determine sizeof(void*) via CMAKE variable instead of test program
(transplanted from 0ec1fc9e11
)
2014-10-14 14:14:25 +02:00
Gael Guennebaud
aadbfe78c2 bug #890: extract_data might return 0x0, thus breaking aliasing detection 2014-10-10 16:42:32 +02:00
Gael Guennebaud
7d5e16c733 Add missing default ctor in Rotation2D 2014-09-30 16:59:28 +02:00
Christoph Hertzberg
e395a8042a Fix bug #884: No malloc for zero-sized matrices or for Ref without temporaries
manually ported from 4ba8aa1482
2014-09-25 16:25:31 +02:00
Gael Guennebaud
91f1a161ca bug #879: tri1 = mat * tri2 was compiling and running incorrectly if tri2 was not numerically triangular. Workaround the issue by evaluating mat*tri2 into a temporary. 2014-09-22 17:20:42 +02:00
Gael Guennebaud
16bca3bfe2 Fix SparseQR for row-major inputs.
(grafted from 755e77266f
)
2014-09-19 09:58:56 +02:00
Gael Guennebaud
e0ab58d815 Fix bug #791: infinite loop in JacobiSVD in the presence of NaN.
(grafted from d6236d3b26
)
2014-09-10 11:54:20 +02:00
Gael Guennebaud
c67a7148c4 ArrayWrapper and MatrixWrapper classes should not be nested by reference.
(grafted from 921a645481
)
2014-09-10 10:33:19 +02:00
Gael Guennebaud
38dc683901 Fix bug #822: outer products needed linear access, and add respective unit tests
(grafted from 51b3f558bb
)
2014-09-08 10:21:22 +02:00
Jitse Niesen
cad0fa5d77 Replace asm by __asm__ (bug #873).
Thanks to Markus Eisenmann for report and initial patch.
2014-09-06 11:54:47 +01:00
Gael Guennebaud
5daebe0a27 bug #871: fix compilation on ARM/Neon regarding __has_builtin usage (backport) 2014-09-01 10:58:07 +02:00
Georg Drenkhahn
05fb735d1d Added missing STL include of <list> in main.h
Removed duplicated include of <sstream>
Added comments on the background of min/max macro definitions and STL header includes
(grafted from e49e84d979
)
2014-08-29 10:41:05 +02:00
Gael Guennebaud
7443d8b4e9 bug #867: forward the cmake generator when testing support for fortran. (was already fixed in the default branch) 2014-08-28 09:15:33 +02:00
Georg Drenkhahn
36506511a1 Fixed CMakeLists.txt files to prevent CMake 3.0.0 warnings about deprecated LOCATION target property.
Small whitespace cleanup in CMakeLists.txt.
2014-08-22 12:13:07 +02:00
Gael Guennebaud
3afdc6d95a In SparseQR, calling factorize() without analyzePattern() was broken. 2014-08-26 23:32:32 +02:00
Gael Guennebaud
c14c03490f merge 2014-08-26 13:00:11 +02:00
Gael Guennebaud
c880590d27 bug #861: enable posix_memalign with PGI
(grafted from 2e50289ba3
)
2014-08-26 12:54:19 +02:00
Gael Guennebaud
54294e2293 bug #857: workaround MSVC compilation issue. 2014-08-26 12:52:29 +02:00
Gael Guennebaud
c7331ebb06 Do not apply the preconditioner before starting the iterations as this might destroy a very good initial guess.
(grafted from b49ef99617
)
2014-08-21 22:14:25 +02:00
Gael Guennebaud
0321449944 bug #854: fix numerical issue in SelfAdjointEigenSolver::computeDirect for 3x3 matrices. The tolerance to detect stable cross products was too optimistic.
Add respective unit tests.
(grafted from 9c0aa81fbf
)
2014-08-21 10:49:09 +02:00
Gael Guennebaud
44c390a370 Added tag 3.2.2 for changeset bbaf01712c 2014-08-04 12:52:31 +02:00
Gael Guennebaud
bbaf01712c bump to 3.2.2 2014-08-04 12:51:54 +02:00
Gael Guennebaud
8e875d3c38 Memory allocated on the stack is freed at the function exit, so reduce iteration count to avoid stack overflow
(grafted from e51da9c3a8
)
2014-08-04 12:46:00 +02:00
Gael Guennebaud
8d69b87c53 Make the ordering method of SimplicialL[D]LT user configurable.
(grafted from d4cc1bdc7f
)
2014-07-20 14:22:58 +02:00
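
A sketch of the user-configurable ordering: the third template argument of SimplicialLLT/SimplicialLDLT selects the fill-in reducing ordering (AMDOrdering being the default); the tiny system below is made up for illustration:

    #include <Eigen/Dense>
    #include <Eigen/Sparse>

    int main() {
      typedef Eigen::SparseMatrix<double> SpMat;
      SpMat A(2, 2);
      A.insert(0, 0) = 2.0; A.insert(1, 1) = 3.0;
      Eigen::VectorXd b(2); b << 1, 1;
      // Pick the natural (identity) ordering instead of the default AMD one.
      Eigen::SimplicialLDLT<SpMat, Eigen::Lower, Eigen::NaturalOrdering<int> > solver;
      Eigen::VectorXd x = solver.compute(A).solve(b);
      return x.size() == 2 ? 0 : 1;
    }
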
Christoph Hertzberg
49cbaf3856 Add a note on EIGEN_DONT_PARALLELIZE to the preprocessor documentation page (requested in IRC)
(transplanted from 68eafc10b1
)
2014-07-18 15:42:12 +02:00
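
The macro in question is a compile-time switch; defining it before including any Eigen header disables Eigen's own OpenMP parallelization (sketch of typical usage):

    #define EIGEN_DONT_PARALLELIZE  // must come before the first Eigen include
    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd a = Eigen::MatrixXd::Random(256, 256);
      Eigen::MatrixXd b = a * a;  // runs single-threaded even if built with -fopenmp
      return b.size() > 0 ? 0 : 1;
    }
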
Gael Guennebaud
9b00035438 bug #843: fix JacobiSVD for complexes and extend respective unit test to check with random tricky matrices,
and backport other JacobiSVD fixes
2014-07-17 17:09:15 +02:00
Gael Guennebaud
e215740e8e Fix bug #838: detect outer products from either the lhs or rhs 2014-07-11 17:17:17 +02:00
Gael Guennebaud
0cc67589d3 Fix bug #838: fix dense * sparse and sparse * dense outer products 2014-07-11 16:31:41 +02:00
Christoph Hertzberg
51e2e93019 Backed out of changeset 6091:9d3e0da38576dddc4df25c0e61ad6685193eb630
Unfortunately this breaks things at other places
2014-07-10 16:12:33 +02:00
Christoph Hertzberg
9d3e0da385 Make MatrixBase::makeHouseholder resize its output vector if it is zero
(transplanted from f27f55bee3
)
2014-07-10 14:59:18 +02:00
Kolja Brix
6ff72f40cf Fix GMRES: Initialize essential Householder vector with correct dimension. Add check if initial guess is already a sufficient approximation.
(transplanted from e955725ff1
)
2014-07-10 08:20:55 +02:00
Benoit Steiner
ea0906dfd8 Improved evaluation of tensor expressions when used as rvalues 2014-07-08 16:43:28 -07:00
Benoit Steiner
cc1bacea5b Improved the efficiency of the tensor evaluation code on thread pools and gpus. 2014-07-08 16:39:28 -07:00
Benoit Steiner
c285fda7f4 Extended the functionality of the TensorDeviceType classes 2014-07-08 16:30:48 -07:00
Chen-Pang He
160034bba1 Fix bug #839 2014-07-09 03:32:32 +08:00
Gael Guennebaud
6eb16aae2d bug #808: fix set_from_triplets temporary matrix type (already fixed in the devel branch) 2014-07-08 19:10:26 +02:00
Gael Guennebaud
4777ca1afb bug #808: fix implicit conversions from int/longint to float/double 2014-07-08 18:59:18 +02:00
Gael Guennebaud
0e0ae40084 bug #808: use double instead of float for the increasing size ratio in CompressedStorage::resize 2014-07-08 18:58:41 +02:00
Gael Guennebaud
b73908000c Fix bug #809: unused variable warning
(grafted from 5c4733f6e4
)
2014-07-08 18:38:34 +02:00
Gael Guennebaud
08b0c08e5e Fix LDLT with semi-definite complex matrices: owing to round-off errors, the diagonal was not real. Also exploit the fact that the diagonal is real in the rest of LDLT 2014-07-08 10:04:27 +02:00
Benoit Steiner
7d53633e05 Added support for tensor slicing 2014-07-07 14:10:36 -07:00
Benoit Steiner
bc072c5cba Added support for tensor slicing 2014-07-07 14:08:45 -07:00
Benoit Steiner
47981c5925 Added support for tensor slicing 2014-07-07 14:07:57 -07:00
Gael Guennebaud
bbe9e22d60 LDLT is not rank-revealing, so we should not attempt to use the biggest diagonal elements as thresholds. 2014-07-02 23:04:46 +02:00
Gael Guennebaud
b18a7ff6be Do not attempt to include <intrin.h> on Windows CE 2014-07-02 16:13:05 +02:00
Gael Guennebaud
61b88d2feb merge with default branch 2014-07-02 09:35:37 +02:00
Gael Guennebaud
e84bdbb445 Fix regression in bicgstab: the threshold used to detect the need for a restart was much too large.
(grafted from bf334b8ae5
)
2014-07-01 22:29:04 +02:00
Gael Guennebaud
8f4cdbbc8f Fix typo in dense * diagonal evaluator. 2014-07-01 18:04:30 +02:00
Gael Guennebaud
7390af91b6 Implement evaluators for sparse*dense products 2014-07-01 17:53:18 +02:00
Gael Guennebaud
1e6f53e070 Use DiagonalShape as the storage kind of DiagonalBase<>. 2014-07-01 17:52:58 +02:00
Gael Guennebaud
6f846ef9c6 Split StorageKind promotion into two helpers: one for products, and one for coefficient-wise operations. 2014-07-01 17:51:53 +02:00
Gael Guennebaud
3c63446507 Update copyright dates 2014-07-01 13:27:35 +02:00
Gael Guennebaud
746d2db6ed Implement evaluators for sparse * sparse with auto pruning. 2014-07-01 13:18:56 +02:00
Gael Guennebaud
441f97b2df Implement evaluators for sparse * sparse products 2014-07-01 11:50:20 +02:00
Gael Guennebaud
0ad7a644df Implement nonZeros() for Transpose<sparse> 2014-07-01 11:49:46 +02:00
Gael Guennebaud
7ffd55c980 Do not bypass aliasing in sparse '=' assignments 2014-07-01 11:48:49 +02:00
Gael Guennebaud
065344a06b Fix bug #836: extend SparseQR to support more columns than rows. 2014-07-01 10:24:46 +02:00
Gael Guennebaud
c401167712 Fix double constructions of the nested CwiseBinaryOp evaluator in sparse*diagonal product iterator. 2014-06-27 16:41:45 +02:00
Gael Guennebaud
73e686c6a4 Implement evaluators for sparse times diagonal products. 2014-06-27 15:54:44 +02:00
Gael Guennebaud
ae039dde13 Add a NoPreferredStorageOrderBit flag for expressions having no preferred storage order.
It is currently only used in Product.
2014-06-27 15:53:51 +02:00
Gael Guennebaud
e1f1f66a52 Fix some ICEs with VC11. 2014-06-27 15:11:38 +02:00
Gael Guennebaud
f0648f8860 Implement evaluator for sparse views. 2014-06-26 13:52:19 +02:00
Gael Guennebaud
54607665ab Fix inverse evaluator 2014-06-25 23:44:59 +02:00
Gael Guennebaud
a7bd4c455a Update sparse reductions and sparse-vectors to evaluators. 2014-06-25 17:24:43 +02:00
Gael Guennebaud
b868bfb84a Make operator=(EigenBase<>) use the new assignment mechanism and introduce a generic EigenBase to EigenBase assignment kind based on the previous evalTo mechanism. 2014-06-25 17:23:52 +02:00
Gael Guennebaud
3b19b466a7 Generalize static assertions on matching sizes to avoid the need for SizeAtCompileTime 2014-06-25 17:22:12 +02:00
Gael Guennebaud
199ac3f2e7 Implement evaluators for sparse coeff-wise views 2014-06-25 17:21:04 +02:00
Gael Guennebaud
e3ba5329ff Implement evaluators for sparse Block. 2014-06-25 09:58:26 +02:00
Gael Guennebaud
17f119689e implement evaluator for SparseVector 2014-06-25 09:58:03 +02:00
Gael Guennebaud
3849cc65ee Implement binaryop and transpose evaluators for sparse matrices 2014-06-23 10:40:03 +02:00
Gael Guennebaud
ec0a8b2e6d rm conflict 2014-06-20 16:30:34 +02:00
Gael Guennebaud
7fa87a8b12 Backport changes from old to new expression engines 2014-06-20 16:17:57 +02:00
Gael Guennebaud
b29b81a1f4 merge with default branch 2014-06-20 15:55:44 +02:00
Gael Guennebaud
47585c8ab2 merge 2014-06-20 15:49:07 +02:00
Gael Guennebaud
c415b627a7 Started to move the SparseCore module to evaluators: implemented assignment and cwise-unary evaluator 2014-06-20 15:42:13 +02:00
Gael Guennebaud
78bb808337 1- Introduce sub-evaluator types for unary, binary, product, and map expressions to ease specializing them.
2- Remove a lot of code which should not be there with evaluators, in particular coeff/packet methods implemented in the expressions.
2014-06-20 15:39:38 +02:00
Gael Guennebaud
caf4936661 Add assertion and warning on the requirements of SparseQR and COLAMDOrdering
(grafted from 98ef44fe55
)
2014-06-20 14:43:47 +02:00
Benoit Steiner
774c3c1e0a Created additional unit tests for the tensor code and improved existing ones. 2014-06-13 10:20:28 -07:00
Benoit Steiner
f80c8e17eb Silenced a compilation warning 2014-06-13 10:12:12 -07:00
Benoit Steiner
38ab7e6ed0 Reworked the expression evaluation mechanism in order to make it possible to efficiently compute convolutions and contractions in the future:
* The scheduling of computation is moved out of the assignment code and into a new TensorExecutor class
 * The assignment itself is now a regular node on the expression tree
 * The expression evaluators start by recursively evaluating all their subexpressions if needed
2014-06-13 09:56:51 -07:00
Benoit Steiner
aa664eabb9 Fixed a few compilation errors. 2014-06-10 10:31:29 -07:00
Benoit Steiner
4304c73542 Pulled latest updates from the Eigen main trunk. 2014-06-10 10:23:32 -07:00
Benoit Steiner
925fb6b937 TensorEval are now typed on the device: this will make it possible to use partial template specialization to optimize the strategy of each evaluator for each device type.
Started work on partial evaluations.
2014-06-10 09:14:44 -07:00
Benoit Steiner
a77458a8ff Fixes compilation errors triggered when compiling the tensor contraction code with cxx11 enabled. 2014-06-09 10:06:57 -07:00
Benoit Steiner
a669052f12 Improved support for rvalues in tensor expressions. 2014-06-09 09:45:30 -07:00
Benoit Steiner
36a2b2e9dc Prevent the generation of unlaunchable cuda kernels when compiling in debug mode. 2014-06-09 09:43:51 -07:00
Benoit Steiner
2859a31ac8 Fixed compilation error 2014-06-09 09:42:34 -07:00
Benoit Steiner
d13711a363 Pulled latest changes from the main branch 2014-06-09 09:35:04 -07:00
Benoit Steiner
fe102248ac Fixed the threadpool test 2014-06-09 09:19:21 -07:00
Benoit Steiner
8c8ae2d819 Fixed a typo 2014-06-07 11:24:38 -07:00
Benoit Steiner
29aebf96e6 Created the pblend packet primitive and implemented it using SSE and AVX instructions. 2014-06-06 20:18:44 -07:00
Benoit Steiner
79085e08e9 Fixed a typo 2014-06-06 20:16:13 -07:00
Benoit Steiner
a961d72e65 Added support for convolution and reshaping of tensors. 2014-06-06 16:25:16 -07:00
Benoit Steiner
8998f4099e Created additional tests for the tensor code. 2014-06-05 10:49:34 -07:00
Benoit Steiner
6fa6cdd2b9 Added support for tensor contractions
Updated expression evaluation mechanism to also compute the size of the tensor result
Misc fixes and improvements.
2014-06-04 09:21:48 -07:00
Benoit Steiner
736267cf6b Added support for additional tensor operations:
* comparison (<, <=, ==, !=, ...)
  * selection
  * nullary ops such as random or constant generation
  * misc unary ops such as log(), exp(), or a user defined unaryExpr()
Cleaned up the code a little.
2014-05-22 16:22:35 -07:00
Benoit Steiner
7402fea0a8 Vectorized the evaluation of tensor expression (using SSE, AVX, NEON, ...)
Added the ability to parallelize the evaluation of a tensor expression over multiple cpu cores.
Added the ability to offload the evaluation of a tensor expression to a GPU.
2014-05-16 15:08:05 -07:00
Benoit Steiner
0320f7e3a7 Added support for fixed sized tensors.
Improved support for tensor expressions.
2014-05-06 11:18:37 -07:00
Benoit Steiner
c0f2cb016e Extended support for Tensors:
* Added ability to map a region of the memory to a tensor
  * Added basic support for unary and binary coefficient wise expressions, such as addition or square root
  * Provided an emulation layer to make it possible to compile the code with compilers (such as nvcc) that don't support cxx11.
2014-04-28 10:32:27 -07:00
Jitse Niesen
ffc995c9e4 Implement evaluator<ReturnByValue>.
All supported tests pass apart from Sparse and Geometry, except the adjoint_4 test, where a = a.transpose() raises an assert.
2014-04-16 18:16:36 +01:00
Jitse Niesen
b30706bd5c Fix typo in Inverse.h 2014-04-15 22:51:46 +01:00
Jitse Niesen
59f5f155c2 Port products with permutation matrices to evaluators. 2014-04-15 15:21:38 +01:00
Gael Guennebaud
0a6c472335 A bit of cleaning 2014-03-13 15:44:20 +01:00
Gael Guennebaud
aceae8314b Resurrect EvalBeforeNestingBit to control nested_eval 2014-03-12 20:25:36 +01:00
Gael Guennebaud
16d4c7a5e8 Conditionally disable unit tests that are not supported by evaluators yet 2014-03-12 20:23:44 +01:00
Gael Guennebaud
a395024d44 More debug info and use lazyProd instead of operator* to query the right flags 2014-03-12 18:14:58 +01:00
Gael Guennebaud
f74ed34539 Fix regressions in redux_evaluator flags and evaluator<Block> flags 2014-03-12 18:14:08 +01:00
Gael Guennebaud
5e26b7cf9d Extend evaluation traits debugging info 2014-03-12 18:13:18 +01:00
Gael Guennebaud
74b1d79d77 merge default and evaluator branches 2014-03-12 16:24:25 +01:00
Gael Guennebaud
0b362e0c9a This file is not needed anymore 2014-03-12 16:18:54 +01:00
Gael Guennebaud
a6be1952f4 Fix a few regression when moving the flags 2014-03-12 16:18:34 +01:00
Gael Guennebaud
0bd5671b9e Fix Eigenvalues module 2014-03-12 13:35:44 +01:00
Gael Guennebaud
8dd3b716e3 Move evaluation related flags from traits to evaluator and fix evaluators of MapBase and Replicate 2014-03-12 13:34:11 +01:00
Gael Guennebaud
7eefdb948c Migrate JacobiSVD to Solver 2014-03-11 13:43:46 +01:00
Gael Guennebaud
082f7ddc37 Port Cholesky module to evaluators 2014-03-11 13:33:44 +01:00
Gael Guennebaud
9be72cda2a Port QR module to Solve/Inverse 2014-03-11 11:47:32 +01:00
Gael Guennebaud
ae40583965 Fix CoeffReadCost issues 2014-03-11 11:47:14 +01:00
Gael Guennebaud
5806e73800 It is not clear what XprType::Nested should be, so let's use nested<Xpr>::type as much as possible 2014-03-11 11:44:11 +01:00
Gael Guennebaud
2bf63c6b4a Even ReturnByValue should not evaluate when assembling the expression 2014-03-11 11:42:07 +01:00
Gael Guennebaud
da6ec81282 Move CoeffReadCost mechanism to evaluators 2014-03-10 23:24:40 +01:00
Gael Guennebaud
354bd8a428 Hide legacy dense assignment routines with EIGEN_TEST_EVALUATORS 2014-03-10 09:30:58 +01:00
Gael Guennebaud
5c0f294098 Fix evaluators unit test (i.e., when only EIGEN_ENABLE_EVALUATORS is defined) 2014-03-10 09:28:00 +01:00
Gael Guennebaud
cbc572caf7 Split LU/Inverse.h to Core/Inverse.h for the generic Inverse expression, and LU/InverseImpl.h for the dense implementation of dense.inverse() 2014-02-24 11:49:30 +01:00
Gael Guennebaud
1e0c2f6ddb Hide some deprecated classes. 2014-02-24 11:41:19 +01:00
Gael Guennebaud
c98881e130 By-pass ProductBase for triangular and selfadjoint products and get rid of ProductBase 2014-02-23 22:51:13 +01:00
Gael Guennebaud
d67548f345 Get rid of GeneralProduct<> for GemvProduct 2014-02-21 17:13:28 +01:00
Gael Guennebaud
6c7ab50811 Get rid of GeneralProduct<> for GemmProduct 2014-02-21 16:43:03 +01:00
Gael Guennebaud
728c3d2cb9 Get rid of GeneralProduct for outer-products, and get rid of ScaledProduct 2014-02-21 16:27:24 +01:00
Gael Guennebaud
af31b6c37a Generalize evaluator<Inverse<>> such that there is no need to specialize it 2014-02-21 15:22:08 +01:00
Gael Guennebaud
93125e372d Port LU module to evaluators (except image() and kernel()) 2014-02-20 15:26:15 +01:00
Gael Guennebaud
b2e1453e1e Some bit flags and internal structures are deprecated 2014-02-20 15:25:06 +01:00
Gael Guennebaud
9621333545 Fix dimension of Solve expression 2014-02-20 15:24:21 +01:00
Gael Guennebaud
5f6ec95291 Propagate LvalueBit flag to TriangularView 2014-02-20 15:24:00 +01:00
Gael Guennebaud
ecd2c8f37b Add general Inverse<> expression with evaluator 2014-02-20 14:18:24 +01:00
Gael Guennebaud
2eee6eaf3c Fix mixing scalar types with evaluators 2014-02-19 16:30:17 +01:00
Gael Guennebaud
8af02d19b2 ExprType::Nested has a new meaning now... 2014-02-19 15:16:11 +01:00
Gael Guennebaud
95b0a6707b evaluator<Replicate> must evaluate its argument to avoid redundant evaluations 2014-02-19 14:51:46 +01:00
Gael Guennebaud
b1ab6a8e0b Add missing assertion in swap() 2014-02-19 14:06:35 +01:00
Gael Guennebaud
61cff28618 Disable Flagged and ForceAlignedAccess 2014-02-19 14:05:56 +01:00
Gael Guennebaud
68e8ddaf94 Fix vectorization logic wrt assignment functors 2014-02-19 13:26:07 +01:00
Gael Guennebaud
3a735a6cf1 Fix lazy evaluation in Ref 2014-02-19 13:17:41 +01:00
Gael Guennebaud
ccc41128fb Add a Solve expression for uniform treatment of solve() methods. 2014-02-19 11:33:29 +01:00
Gael Guennebaud
b3a07eecc5 Fix CoeffReadCost of products to handle Dynamic costs 2014-02-19 11:32:04 +01:00
Gael Guennebaud
c16b80746a isApprox must honor nested_eval 2014-02-19 11:30:58 +01:00
Gael Guennebaud
5b78780def Add evaluator shortcut for triangular ?= product 2014-02-18 17:43:16 +01:00
Gael Guennebaud
8169c6ac59 Simplify implementation of coeff-based products to fully exploit our reduction mechanisms.
If this results in performance regressions, then we should optimize reduction rather than
somehow duplicate the code.
2014-02-18 16:57:25 +01:00
Gael Guennebaud
463554c254 Merge with default branch 2014-02-18 15:45:39 +01:00
Gael Guennebaud
82c066b3c4 Cleaning 2014-02-18 15:44:32 +01:00
Gael Guennebaud
0543cb51b5 Product::coeff method are also OK for lazy products (including diagonal products) 2014-02-18 14:51:41 +01:00
Gael Guennebaud
99e27916cf Fix all()/any() for evaluators 2014-02-18 14:26:25 +01:00
Gael Guennebaud
06545058bb Temporary workaround for permutations 2014-02-18 13:33:04 +01:00
Gael Guennebaud
7002aa858f Support Product::coeff(0,0) even for dynamic matrices 2014-02-18 13:32:30 +01:00
Gael Guennebaud
8cfb138e73 Finally, the simplest option remains to defer resizing as late as possible 2014-02-18 13:31:44 +01:00
Gael Guennebaud
1b5de5a37b Add evaluator for Ref 2014-02-18 13:30:16 +01:00
Gael Guennebaud
a08cba6b5f Move is_diagonal to XprHelper, forward declare Ref 2014-02-18 11:03:59 +01:00
Gael Guennebaud
573c587e3d New design for handling automatic transposition 2014-02-18 10:53:14 +01:00
Gael Guennebaud
551bf5c66a Get rid of DiagonalProduct 2014-02-18 10:52:26 +01:00
Gael Guennebaud
2d136d3d7f Get rid of SelfCwiseBinaryOp 2014-02-18 10:52:00 +01:00
Gael Guennebaud
873401032b Fix scalar * product optimization when 'product' includes a selfadjoint matrix 2014-02-17 19:00:45 +01:00
Gael Guennebaud
d595fd31f5 Deal with automatic transposition in call_assignment, fix a few shortcomings 2014-02-17 16:11:55 +01:00
Gael Guennebaud
bffa15142c Add evaluator support for diagonal products 2014-02-17 16:10:55 +01:00
Gael Guennebaud
94acccc126 Fix Random().normalized() by introducing a nested_eval helper (recall that the old nested<> class is deprecated) 2014-01-26 15:35:44 +01:00
Gael Guennebaud
34694d8828 Fix evaluator<Replicate> for fixed size objects 2014-01-26 15:34:26 +01:00
Gael Guennebaud
ee1c55f923 Add missing template keyword 2014-01-26 14:55:25 +01:00
Gael Guennebaud
f54e62e4a9 Port evaluation from selfadjoint to full to evaluators 2014-01-26 12:18:36 +01:00
Gael Guennebaud
5fa7262e4c Refactor triangular assignment 2014-01-25 23:02:14 +01:00
Gael Guennebaud
fef534f52e fix scalar * prod in evaluators unit test 2014-01-25 19:06:07 +01:00
Gael Guennebaud
d357bbd9c0 Fix a few regressions regarding temporaries and products 2013-12-14 22:53:47 +01:00
Gael Guennebaud
27c068e9d6 Make selfadjoint products use evaluators 2013-12-13 18:09:07 +01:00
Gael Guennebaud
e94fe4cc3e fix resizing in noalias for blocks, and make -=/+= use evaluators 2013-12-13 18:06:58 +01:00
Gael Guennebaud
2ca0ccd2f2 Add support for triangular products with evaluators 2013-12-07 17:17:47 +01:00
Gael Guennebaud
8d8acc3ab4 Move inner product special functions to a base class to avoid ambiguous calls 2013-12-04 22:58:19 +01:00
Gael Guennebaud
6c5e915e9a Enable use of evaluators for noalias and lazyProduct, add conversion to scalar for inner products 2013-12-03 17:17:53 +01:00
Gael Guennebaud
f0b82c3ab9 Make reductions compatible with evaluators 2013-12-02 17:54:38 +01:00
Gael Guennebaud
6f1a0479b3 fix a typo in triangular assignment 2013-12-02 17:54:15 +01:00
Gael Guennebaud
b5fd774775 Fix flags of Product<> 2013-12-02 17:53:26 +01:00
Gael Guennebaud
34ca81b1bf Add direct assignment of products 2013-12-02 16:37:58 +01:00
Gael Guennebaud
7f917807c6 Fix product evaluator when TEST_EVALUATOR is not ON 2013-12-02 16:19:14 +01:00
Gael Guennebaud
8af1ba5346 Make swap unit test work with evaluators 2013-12-02 15:07:45 +01:00
Gael Guennebaud
c6f7337032 Get rid of call_dense_swap_loop 2013-12-02 14:44:13 +01:00
Gael Guennebaud
626821b0e3 Add evaluator/assignment to TriangularView expressions 2013-12-02 14:06:17 +01:00
Gael Guennebaud
27ca9437a1 Fix usage of Dense versus DenseShape 2013-12-02 14:05:34 +01:00
Gael Guennebaud
d0261bd26c Fix swap in DenseBase 2013-11-30 10:42:23 +01:00
Gael Guennebaud
c15c65990f First step toward the generalization of evaluators to triangular, sparse and other fanciness.
Remove the product_tag template parameter from Product.
2013-11-29 17:50:59 +01:00
Gael Guennebaud
fb6e32a62f Get rid of evaluator_impl 2013-11-29 16:45:47 +01:00
Gael Guennebaud
d331def6cc add definition of product_tag 2013-11-29 16:18:22 +01:00
Gael Guennebaud
5584275325 Remove HasEvalTo and all at once eval mode 2013-11-29 13:38:59 +01:00
Gael Guennebaud
cc6dd878ee Refactor dense product evaluators 2013-11-27 17:32:57 +01:00
Gael Guennebaud
fc6ecebc69 Simplify evaluator of EvalToTemp 2013-11-27 11:32:07 +01:00
Gael Guennebaud
230f5c3aa9 Evaluator: introduce the main Assignment class, add call_assignment to bypass NoAlias and AssumeAliasing, and some bits of cleaning 2013-11-25 15:20:31 +01:00
Gael Guennebaud
0c4fc69d62 JacobiSVD: move from Lapack to Matlab strategy for the default threshold
(grafted from 019dcfc21d
)
2013-11-03 13:18:56 +01:00
Gael Guennebaud
e16e52d493 Add a rank method with threshold control to JacobiSVD, and make solve uses it to return the minimal norm solution for rank-deficient problems
(grafted from bbd49d194a
)
2013-11-01 18:21:46 +01:00
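
An illustration of the threshold-controlled rank and the minimal-norm solve on a deliberately rank-deficient system (values made up; setThreshold and rank are the interfaces named in the commit above):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A(3, 3);
      A << 1, 2, 3,
           2, 4, 6,   // second row is twice the first: rank <= 2
           1, 0, 1;
      Eigen::JacobiSVD<Eigen::MatrixXd> svd(A, Eigen::ComputeThinU | Eigen::ComputeThinV);
      svd.setThreshold(1e-10);  // singular values below this relative level count as zero
      std::cout << "rank = " << svd.rank() << "\n";
      Eigen::Vector3d b(1, 2, 1);
      Eigen::Vector3d x = svd.solve(b);  // minimal-norm least-squares solution
      std::cout << "x = " << x.transpose() << "\n";
    }
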
Gael Guennebaud
c49421a82b The BLAS interface is complete.
(grafted from abc1ca0af1
)
2014-06-06 11:21:19 +02:00
Gael Guennebaud
ccd7beba90 Fix bug #738: use the "current" version of cmake project directories to ease the inclusion of Eigen within other projects. 2014-06-06 11:06:44 +02:00
Gael Guennebaud
84a99f3a93 Enable LinearAccessBit in Block expression for inner-panels 2014-06-06 11:02:20 +02:00
Gael Guennebaud
43c2747e92 Allow EIGEN_STACK_ALLOCATION_LIMIT to be 0 for no limit
(transplanted from d9381598bc
)
2013-08-21 14:29:00 +02:00
Gael Guennebaud
3c5e82ee0b Make the static assertions on maximal fixed-size objects use EIGEN_STACK_ALLOCATION_LIMIT, and raise its default value to 128KB
(transplanted from 7bca2910c7
)
2013-08-20 13:59:33 +02:00
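
A sketch of how the limit is meant to be used: it is a byte count checked by a static assertion on fixed-size objects, 0 disables the check entirely, and per the commits above the default is 128KB (the 512x512 size below is an arbitrary example):

    #define EIGEN_STACK_ALLOCATION_LIMIT 0  // 0 = no limit; must precede Eigen includes
    #include <Eigen/Dense>

    int main() {
      // 512*512 doubles = 2MB on the stack: would trip the default 128KB
      // static assertion, but compiles with the limit disabled. Beware of
      // real stack overflows when doing this.
      Eigen::Matrix<double, 512, 512> big;
      big.setZero();
      return 0;
    }
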
Gael Guennebaud
d132159ba3 Fix bug #819: include path of details.h
(grafted from 0f1e321dd4
)
2014-06-04 11:58:01 +02:00
Jitse Niesen
075b1168b4 Fix documentation of FullPivLU regarding permutation matrices (bug #815).
(transplanted from 64be8659f606970211ef83f12ebd401648c9685c)
2014-05-31 23:05:18 +01:00
Pavel Holoborodko
be027bede8 Fixed bug #647 by using smart_copy instead of bitwise memcpy.
(transplanted from 1472f4bc61
)
2013-08-25 18:02:07 +09:00
Mark Borgerding
f1ed1b7d11 added conjugate 2014-05-26 08:08:28 -04:00
Gael Guennebaud
20b0747bdb Document how to reproduce matlab's rot90
(transplanted from 5d1291a4de
)
2013-11-19 11:51:16 +01:00
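
The documented recipe boils down to a transpose followed by an up-down flip; a small sketch (Matlab's rot90 rotates 90 degrees counter-clockwise):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXi M(2, 3);
      M << 1, 2, 3,
           4, 5, 6;
      Eigen::MatrixXi R = M.transpose().colwise().reverse();  // rot90(M)
      std::cout << R << "\n";  // rows: 3 6 / 2 5 / 1 4
    }
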
Mark Borgerding
11462c1a29 AsciiQuickReference: added .real(), .imag() 2014-05-16 13:45:35 -04:00
Mark Borgerding
e667819055 fixed AsciiQuickReference typo: LinSpace -> LinSpaced 2014-05-08 15:14:12 -04:00
Christoph Hertzberg
35c9f8779d Fix bug #807: Missing scalar type cast in umeyama()
(transplanted from b4beba72a2
)
2014-05-05 14:23:52 +02:00
Christoph Hertzberg
da81e863e2 Fixed bug #806: Missing scalar type cast in Quaternion::setFromTwoVectors()
(transplanted from b5e3d76aa5
)
2014-05-05 14:22:27 +02:00
Gael Guennebaud
c5c4269961 Fix bug #803: avoid char* to int* conversion
(grafted from 07986189b7
)
2014-05-01 23:03:54 +02:00
Mark Borgerding
b734863536 Check IMKL version for compatibility with Eigen (applying changeset e0dbb68c2f
to 3.2 branch)
2014-04-25 12:44:47 -04:00
Jitse Niesen
1046ea7a89 doc: Note that dm2 = sm1 + dm1 is not possible (see bug #632). 2014-04-07 13:49:51 +01:00
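
What the note warns about, together with the workaround the documentation suggested at the time (sketch; matrices made up):

    #include <Eigen/Dense>
    #include <Eigen/Sparse>

    int main() {
      Eigen::SparseMatrix<double> sm1(2, 2);
      sm1.insert(0, 0) = 1.0;
      Eigen::MatrixXd dm1 = Eigen::MatrixXd::Ones(2, 2), dm2;
      // dm2 = sm1 + dm1;  // does not compile in the 3.2 series (bug #632)
      dm2 = dm1;
      dm2 += sm1;          // accumulating a sparse matrix into a dense one works
      return 0;
    }
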
Christoph Hertzberg
8b10081dea Make some actual verifications inside the autodiff unit test
(transplanted from 1cb8de1250
)
2014-04-01 17:44:48 +02:00
Mark Borgerding
042bd9cbe2 immintrin.h did not come until Intel version 11 2014-03-26 22:23:08 -04:00
Christoph Hertzberg
93e867b63c Fix bug #222. Make temporary matrix column-major independently of EIGEN_DEFAULT_TO_ROW_MAJOR
(transplanted from 60cd361ebe
)
2014-03-26 17:48:30 +01:00
Mark Borgerding
e702934dfa fixed ColPivHouseholderQR<>::rank (part of bbd49d194a
)
2014-03-20 14:25:50 -04:00
Gael Guennebaud
eef44fb2a5 Relax Ref such that Ref<MatrixXf> accepts a RowVectorXf which can be seen as a degenerate MatrixXf(1,N)
(grafted from bb4b67cf39
)
2014-03-13 18:04:19 +01:00
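
A sketch of what the relaxation permits: a row vector now binds to a Ref<MatrixXf> parameter as a degenerate 1-by-N matrix, without a copy (the helper function name is made up):

    #include <Eigen/Dense>
    #include <iostream>

    void scaleInPlace(Eigen::Ref<Eigen::MatrixXf> m) { m *= 2.f; }  // hypothetical helper

    int main() {
      Eigen::RowVectorXf r = Eigen::RowVectorXf::Ones(4);
      scaleInPlace(r);         // accepted after this change
      std::cout << r << "\n";  // 2 2 2 2
    }
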
Christoph Hertzberg
eb9c8cffd6 bug #755: CommaInitializer produced wrong assertions in absence of ReturnValueOptimization. 2014-03-12 14:00:18 +01:00
Christoph Hertzberg
240e2f4162 bug #759: Removed hard-coded double-math from Quaternion::angularDistance.
Some documentation improvements
(transplanted from 88aa18df64
)
2014-03-12 13:43:19 +01:00
Christoph Hertzberg
b0702dca05 Fixed bug #754. Only inserted (!defined(_WIN32_WCE)) analogous to the alloc and free implementation (not tested, but should be correct).
(transplanted from d5cc083782
)
2014-03-05 14:50:00 +01:00
Gael Guennebaud
7191f31961 swap 3.2 <-> default CTestConfig.cmake file 2014-03-05 10:07:54 +01:00
Christoph Hertzberg
6d7bd066e0 Regression test for bug #752
(transplanted from 41e89c73c7
)
2014-02-27 12:57:24 +01:00
Jitse Niesen
66078fbd58 Added tag 3.2.1 for changeset 4e80704c53 2014-02-26 15:35:39 +00:00
Jitse Niesen
4e80704c53 Bump version number to 3.2.1 2014-02-26 15:35:18 +00:00
Christoph Hertzberg
043ece9730 Make pivoting HouseholderQR compatible with custom scalar types
(transplanted from 6b6071866b
)
2014-02-25 18:55:16 +01:00
Gael Guennebaud
48db2b8799 Implement bug #317: use a template function call to suppress unused variable warnings. 2014-02-24 18:18:52 +01:00
Jitse Niesen
593a82202f Fix bug #748 - array_5 test fails for seed 1392781168.
(grafted from 6fecb6f1b6
)
2014-02-24 14:10:17 +00:00
Christoph Hertzberg
f24ba33c2d Specify what non-resizeable objects are in transposeInPlace and adjointInPlace (cf bug #749)
(transplanted from 3e439889e0
)
2014-02-24 13:12:10 +01:00
Gael Guennebaud
ef807ea020 Mark Eigen2 support deprecated 2014-02-20 09:35:50 +01:00
Gael Guennebaud
da19c48d61 Fix typo 2014-02-20 09:06:06 +01:00
Gael Guennebaud
cef49d21f0 More int versus Index fixes
(grafted from 5960befc20
)
2014-02-19 21:42:29 +01:00
Christoph Hertzberg
53726663c7 Relaxed umeyama test. Problem was ill-posed if the linear part was scaled with a very small number. This should fix bug #744.
(transplanted from b14a4628af
)
2014-02-17 13:48:00 +01:00
Gael Guennebaud
2ad3dac422 Fix sparse_product/sparse_extra unit tests
(grafted from ed461ba9bc
)
2014-02-17 09:57:47 +01:00
Gael Guennebaud
e3d34064bf Fix FFTW unit test with clang
(grafted from 3bb57e21a8
)
2014-02-17 09:56:46 +01:00
Gael Guennebaud
3f5591981f Fix a few buggy Index-to-int conversions
(grafted from 4b6b3f310f
)
2014-02-15 09:35:23 +01:00
Gael Guennebaud
6def9fd52b Fix propagation of index type
(grafted from 0b1430ae10
)
2014-02-13 23:58:28 +01:00
Gael Guennebaud
76ee39485f Fix infinite loop in SparseLU
(grafted from cd606bbc94
)
2014-02-14 23:10:16 +01:00
Gael Guennebaud
0c6b931cbc Fix enumeral mismatch warning 2014-02-14 22:10:39 +01:00
Gael Guennebaud
fd96ff166d alloca is not necessarily aligned on Windows
(grafted from 97965dde9b
)
2014-02-14 00:04:38 +01:00
Gael Guennebaud
9a09b75df3 Fix stable_norm unit test for complexes
(grafted from 0715d49908
)
2014-02-13 15:49:54 +01:00
Gael Guennebaud
52dc1d7ffd Fix bug #740: overflow issue in stableNorm
(grafted from 3291580630
)
2014-02-13 15:44:01 +01:00
Gael Guennebaud
24e33a0d86 Fix Fortran compiler detection
(grafted from 14422decc2
)
2014-02-13 09:21:13 +01:00
Jitse Niesen
b5333b6760 Fix documentation of MatrixBase::applyOnTheLeft (bug #739)
Add examples; move methods from EigenBase.h to MatrixBase.h
(grafted from 7ea6ef8969
)
2014-02-12 14:03:39 +00:00
Gael Guennebaud
6a4489c523 fix compilation of Transform * UniformScaling
(grafted from 31c63ef0b4
)
2014-02-12 13:37:23 +01:00
Christoph Hertzberg
7958d92c23 Added examples for casting, made better examples for Maps
(transplanted from e170e7070b
)
2014-02-11 17:27:14 +01:00
Jitse Niesen
044f27546f Fix bug #736: LDLT isPositive returns false for a positive semidefinite matrix
Add unit test covering this case.
(grafted from ff8d81762d
)
2014-02-06 11:06:06 +00:00
Christoph Hertzberg
cd4ea5151f Fix bug #730: Path of OpenGL headers is different on MacOS
(transplanted from febfc7b9b4
)
2014-01-29 22:05:39 +01:00
Gael Guennebaud
f9276f9f90 Remove useless register keyword 2014-01-25 16:57:49 +01:00
Anton Gladky
88ec3fdef4 Port unsupported constrained CG to Eigen3
(grafted from 4cd4be97a7
)
2014-01-15 17:49:52 +01:00
Gael Guennebaud
5b93c59198 QuaternionBase::slerp was documented twice and one explanation was ambiguous.
(grafted from 548216b7ca
)
2014-01-12 11:09:06 +01:00
Christoph Hertzberg
fd5be2f9cc Merge with 598776b088 2013-12-21 21:27:10 +01:00
Christoph Hertzberg
598776b088 Fixed typos in comments
(transplanted from 8a49dd5626
)
2013-12-19 11:55:17 +01:00
Márton Danóczy
cdedc9e90d Added optional run-time size parameters to fixed-size block methods 2013-12-17 01:05:05 +01:00
Christoph Hertzberg
7c1fc0ee7c Fixed and simplified Matlab code and added further block-related examples
(transplanted from 276801b25a
)
2013-11-29 19:54:01 +01:00
Christoph Hertzberg
baf2b13589 Fix bug #609: Euler angles are in Range [0:pi]x[-pi:pi]x[-pi:pi].
Now the unit test verifies this (also that it is bijective in this range).
2013-11-29 19:42:11 +01:00
Gael Guennebaud
12504a79d1 Fix bug #708: add placement new/delete for array
(transplanted from 49034d1570
)
2013-11-27 09:46:59 +01:00
Gael Guennebaud
ae360a9ec0 Fix FullPivHouseholderQR ctors for non-square fixed-size matrix types
(grafted from 28b2abdbea
)
2013-11-19 12:53:46 +01:00
Gael Guennebaud
516304cd90 Workaround fixing aliasing issue in x = SparseLU::solve(x)
(transplanted from 46dd1bb1be
)
2013-11-15 11:19:19 +01:00
Gael Guennebaud
4c5da3b03a fix overflow and ambiguity in SparseLU memory allocation
(transplanted from 6b471f205e
)
2013-11-15 10:59:19 +01:00
Christoph Hertzberg
b8020d11de Implement boolean reductions for zero-sized objects
(grafted from e59b38abef
)
2013-11-13 16:47:02 +01:00
Gael Guennebaud
6b931b3e47 JacobiSVD: fix a 0/0 issue for complexes
(transplanted from a236e15048
)
2013-11-04 23:58:18 +01:00
Gael Guennebaud
d21708172a SparseLU: fix estimated non-zeros in U
(transplanted from 7c9cdd6030
)
2013-11-05 00:12:14 +01:00
Gael Guennebaud
8946e0cb80 Fix changeset 2702788da7
for fixed size matrices
(transplanted from 8f496cd3a3
)
2013-11-01 18:17:55 +01:00
Gael Guennebaud
bf9747b9ff Fix bug #678: vectors of row and column transpositions were not properly resized in FullPivQR
(grafted from 2702788da7
)
2013-10-29 18:02:18 +01:00
Christoph Hertzberg
a5522a1381 Use aligned loads in Matrix-Vector product where possible. Fixes bug #689 2013-10-29 12:42:46 +01:00
Martinho Fernandes
d646cc95ad Fix bug #503
C++11 support on simple allocators comes for free. `aligned_allocator` does not
need to add any `construct` overloads to work with C++11 compilers.
(grafted from a1f056cf2a
)
2013-09-10 17:08:04 +02:00
Gael Guennebaud
8ea9e762d6 Fix bug #672: use exceptions in SuperLU only if they are enabled
(grafted from 90b5d303db
)
2013-10-29 11:26:52 +01:00
vanhoucke
0a44b5249c Silence unused variable warning.
(grafted from 3736e00ae7
)
2013-10-04 00:21:03 +00:00
Thomas Capricelli
fbc5beadc8 simplify/uniformize eigen_gen_docs 2013-10-18 12:56:44 +02:00
Christoph Hertzberg
b2368b3408 Copy all format flags (not only precision) from actual output stream when calculating the maximal width 2013-10-17 14:30:09 +02:00
Christoph Hertzberg
965ee4e853 consider all columns for aligned output (fixes bug #616) 2013-10-17 14:14:06 +02:00
Christoph Hertzberg
d51c9f1e93 Fixes bug #681
Also fixed some spelling issues in the documentation
2013-10-17 00:03:00 +02:00
Christoph Hertzberg
56f4144035 Use != instead of < to check for emptiness of iterator range (fixes bug #664) 2013-10-16 13:10:15 +02:00
Christoph Hertzberg
609ef90213 Make index type of Triplet default to SparseMatrix::Index as suggested by Kolja Brix. Fixes bug #665. 2013-10-16 13:08:09 +02:00
Gael Guennebaud
f407a86a3f Allow .conservativeResize(rows,cols) on vectors
(grafted from b433fb2857
)
2013-10-16 12:07:33 +02:00
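
An illustration of the newly allowed calls: both the one- and two-argument forms now work on vectors, preserving existing coefficients (newly created ones are left uninitialized):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXd v(3);
      v << 1, 2, 3;
      v.conservativeResize(5);     // keeps 1,2,3; the two new entries are uninitialized
      v.conservativeResize(5, 1);  // the (rows,cols) form is now legal on vectors too
      std::cout << v.head(3).transpose() << "\n";  // 1 2 3
    }
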
Gael Guennebaud
0257cf1cef bug #679: add respective unit test
(transplanted from 2c0303c89e
)
2013-10-15 23:51:01 +02:00
Christoph Hertzberg
941319a198 Fix bug #679 2013-10-15 19:09:09 +02:00
Thomas Capricelli
273a952099 uniformize piwik code among branches 2013-10-11 20:45:21 +02:00
Desire NUENTSA
551d20a824 Fix SPQR Solve() when assigning to a Map object
(grafted from 54e576c88a
)
2013-09-26 15:00:22 +02:00
Desire NUENTSA
f5ed3421e9 Fix leaked memory for successive calls to SPQR
(grafted from fe19f972e1
)
2013-09-24 15:56:56 +02:00
Gael Guennebaud
945b0802c9 Reduce explicit zeros when applying SparseQR's matrix Q
(grafted from 00dc45d0f9
)
2013-09-20 23:28:10 +02:00
Desire NUENTSA
2a0ca0131d Fix assert bug in sparseQR
(grafted from bd21c82a94
)
2013-09-20 18:49:32 +02:00
Pavel Holoborodko
6f7f0ab6c2 Removed unnecessary parentheses 2013-08-20 16:06:13 +09:00
Pavel Holoborodko
68069af969 Added support for custom scalars 2013-08-20 15:00:28 +09:00
Hauke Heibel
af74b16b0f Removed non-standard conforming (17.4.3.1.2/1) leading underscore.
(grafted from b1f4601bf9
)
2013-07-30 08:05:10 +02:00
Gael Guennebaud
f707f15842 Fix elimination tree and SparseQR with rows<cols
(grafted from 1b4623e713
)
2013-09-12 22:16:35 +02:00
Gael Guennebaud
a443b3d98d Fix bug #654: allow implicit transposition in Array to Matrix and Matrix to Array constructors
(grafted from 07417bd03f
)
2013-09-07 00:01:04 +02:00
Gael Guennebaud
811ec5bfcb Another compilation fix with ICC/MSVC combo
(grafted from eda2f8948a
)
2013-09-03 21:42:59 +02:00
Gael Guennebaud
31d40ebc9d Fix compilation with ICC/MSVC combo
(grafted from 1b8394f71f
)
2013-08-21 15:28:53 +02:00
Gael Guennebaud
0c5f4fd8da Make FullPivHouseholderQR::solve return the least-squares solution instead of aborting if no exact solution exists
(grafted from 150c9fe536)
2013-08-20 11:52:48 +02:00
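Sketch of the new behavior (the random 4x2 system is purely illustrative): an overdetermined system no longer triggers an assert, and solve() returns the least-squares solution:

#include <iostream>
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 2); // overdetermined: no exact solution in general
  Eigen::VectorXd b = Eigen::VectorXd::Random(4);
  Eigen::VectorXd x = A.fullPivHouseholderQr().solve(b); // least-squares solution instead of aborting
  std::cout << "residual norm: " << (A * x - b).norm() << std::endl;
  return 0;
}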
Gael Guennebaud
2b50ade6ca Fix bug #642: add vectorization of sqrt for doubles, and make sqrt really safe if EIGEN_FAST_MATH is disabled
(grafted from d4dd6aaed2 and c47010e3d2)
2013-08-19 16:02:27 +02:00
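For illustration (array contents arbitrary), the coefficient-wise sqrt paths this commit touches:

#include <Eigen/Dense>

int main()
{
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(8, 0.0, 7.0);
  Eigen::ArrayXd r = a.sqrt();                // now vectorized for double as well
  Eigen::VectorXd v = r.matrix().cwiseSqrt(); // same kernel through the matrix API
  return v.size() == 8 ? 0 : 1;
}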
Gael Guennebaud
f9149f9ba0 Fix broken link on transforming normals
(transplanted from ace2ed7b87)
2013-08-12 13:38:25 +02:00
Gael Guennebaud
76d05e8236 bug #638: fix typos in sparse tutorial
(transplanted from 956251b738)
2013-08-12 13:37:47 +02:00
Gael Guennebaud
fa81676d64 Fix cost evaluation of partial reductions -> improve performance of vectorwise/replicate expressions involving partial reductions
(transplanted from bffdc491b3)
2013-08-11 19:21:43 +02:00
Gael Guennebaud
b56348046f Ref<> objects must be nested by reference because they potentially store a temporary object
(transplanted from 6719e56b5b)
2013-08-11 17:52:43 +02:00
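A sketch of why the nesting matters (the helper function and sizes are hypothetical): Ref<const ...> may evaluate a compound expression into a temporary it owns, so it must be nested by reference to keep that temporary alive:

#include <Eigen/Dense>

// Hypothetical helper accepting any vector-like argument without copying when possible.
double firstCoeff(const Eigen::Ref<const Eigen::VectorXd>& v)
{
  return v(0);
}

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(4, 4);
  // The sum expression is evaluated into a temporary stored inside the Ref;
  // nesting Ref by value could leave a dead reference to that temporary.
  double c = firstCoeff(m.col(1) + m.col(2));
  return c == c ? 0 : 1;
}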
Jitse Niesen
47a7de7b53 QuickReference.dox: std::tan(array) --> tan(array), same for other functions.
(transplanted from c13e9bbabf)
2013-08-11 10:17:23 +01:00
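What the corrected QuickReference entries look like in code (values arbitrary): the coefficient-wise free functions live in namespace Eigen, not std:

#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  ArrayXd a = ArrayXd::LinSpaced(5, 0.0, 1.0);
  ArrayXd t = tan(a);  // Eigen::tan, found unqualified; std::tan(a) does not exist
  ArrayXd s = a.sin(); // member form, equivalent to sin(a)
  return (t.size() == s.size()) ? 0 : 1;
}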
Jitse Niesen
8607779757 Remove LinearLeastSquares.dox, which should not have been added.
Accidentally included in changeset e37ff98bbb.
(transplanted from 2f0faf117e)
2013-08-06 08:03:39 +01:00
Gael Guennebaud
be71c46a3c Fix bug #635: add isCompressed to MappedSparseMatrix for compatibility
(transplanted from b72a686830)
2013-08-02 11:11:21 +02:00
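Sketch of the predicate on the plain SparseMatrix side (bug #635 makes a MappedSparseMatrix expose the same method for API compatibility):

#include <Eigen/Sparse>

int main()
{
  Eigen::SparseMatrix<double> A(3, 3);
  A.insert(0, 0) = 1.0;           // direct insertion leaves the matrix uncompressed
  bool before = A.isCompressed(); // false
  A.makeCompressed();
  bool after = A.isCompressed();  // true
  return (!before && after) ? 0 : 1;
}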
Gael Guennebaud
4219db123e reduce cancellation probability
(transplanted from e90229a429)
2013-08-02 00:36:06 +02:00
Gael Guennebaud
f003a6df38 Added tag 3.2.0 for changeset 56f9b810ab 2013-07-23 18:49:47 -07:00
Gael Guennebaud
56f9b810ab bump to 3.2 2013-07-23 18:48:35 -07:00
Gael Guennebaud
12815309a6 Added tag 3.2-rc2 for changeset 207747a518 2013-07-19 16:59:01 +02:00
Gael Guennebaud
207747a518 Bump to 3.2-rc2 2013-07-19 16:58:51 +02:00
Gael Guennebaud
5ecfdf2c00 Fix ICE with ICC 11
(transplanted from 660b905e12)
2013-07-19 11:46:54 +02:00
Gael Guennebaud
e788869cf5 Previous isFinite->hasNonFinite change was broken. After discussion let's rename it to allFinite
(transplanted from 4f0bd557a4)
2013-07-18 11:27:04 +02:00
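The renamed predicate in use (a 2x2 example for illustration):

#include <limits>
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Ones(2, 2);
  bool ok = m.allFinite();                 // true: every coefficient is finite
  m(0, 0) = std::numeric_limits<double>::quiet_NaN();
  bool bad = !m.allFinite() && m.hasNaN(); // true after injecting a NaN
  return (ok && bad) ? 0 : 1;
}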
Gael Guennebaud
9df04bcede Rename isFinite to hasNonFinite to avoid future naming collisions.
(transplanted from 6fab4012a3)
2013-07-17 21:13:45 +02:00
Gael Guennebaud
c31606c88a Added tag 3.2-rc1 for changeset 2872d964f4 2013-07-17 10:00:51 +02:00
Gael Guennebaud
2872d964f4 Remove Evaluators in 3.2 branch. 2013-07-17 10:00:36 +02:00
Gael Guennebaud
2c288b3949 Bump to 3.2-rc1 2013-07-17 09:37:52 +02:00
502 changed files with 21691 additions and 21792 deletions

3 .hgeol

@@ -1,9 +1,6 @@
[patterns]
*.sh = LF
*.MINPACK = CRLF
scripts/*.in = LF
debug/msvc/*.dat = CRLF
debug/msvc/*.natvis = CRLF
unsupported/test/mpreal/*.* = CRLF
** = native


@@ -108,8 +108,7 @@ endif()
set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320")
macro(ei_add_cxx_compiler_flag FLAG)
string(REGEX REPLACE "-" "" SFLAG1 ${FLAG})
string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1})
string(REGEX REPLACE "-" "" SFLAG ${FLAG})
check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG})
if(COMPILER_SUPPORT_${SFLAG})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
@@ -119,7 +118,7 @@ endmacro(ei_add_cxx_compiler_flag)
if(NOT MSVC)
# We assume that other compilers are partly compatible with GNUCC
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
set(CMAKE_CXX_FLAGS_DEBUG "-g3")
set(CMAKE_CXX_FLAGS_RELEASE "-g0 -O2")
@@ -143,9 +142,6 @@ if(NOT MSVC)
ei_add_cxx_compiler_flag("-Wpointer-arith")
ei_add_cxx_compiler_flag("-Wwrite-strings")
ei_add_cxx_compiler_flag("-Wformat-security")
ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
ei_add_cxx_compiler_flag("-Wenum-conversion")
ei_add_cxx_compiler_flag("-Wc++11-extensions")
ei_add_cxx_compiler_flag("-Wno-psabi")
ei_add_cxx_compiler_flag("-Wno-variadic-macros")
@@ -157,7 +153,6 @@ if(NOT MSVC)
ei_add_cxx_compiler_flag("-wd981") # disable ICC's "operands are evaluated in unspecified order" remark
ei_add_cxx_compiler_flag("-wd2304") # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
# The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
# Moreover we should not set both -strict-ansi and -ansi
check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
@@ -201,18 +196,6 @@ if(NOT MSVC)
message(STATUS "Enabling SSE4.2 in tests/examples")
endif()
option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
if(EIGEN_TEST_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
message(STATUS "Enabling AVX in tests/examples")
endif()
option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
if(EIGEN_TEST_FMA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
message(STATUS "Enabling FMA in tests/examples")
endif()
option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
if(EIGEN_TEST_ALTIVEC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
@@ -301,12 +284,6 @@ if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT)
message(STATUS "Disabling alignment in tests/examples")
endif()
option(EIGEN_TEST_NO_EXCEPTIONS "Disables C++ exceptions" OFF)
if(EIGEN_TEST_NO_EXCEPTIONS)
ei_add_cxx_compiler_flag("-fno-exceptions")
message(STATUS "Disabling exceptions in tests/examples")
endif()
option(EIGEN_TEST_C++0x "Enables all C++0x features." OFF)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
@@ -324,7 +301,7 @@ if(EIGEN_INCLUDE_INSTALL_DIR)
)
else()
set(INCLUDE_INSTALL_DIR
"${CMAKE_INSTALL_PREFIX}/include/eigen3"
"include/eigen3"
CACHE INTERNAL
"The directory where we install the header files (internal)"
)
@@ -427,7 +404,7 @@ if(cmake_generator_tolower MATCHES "makefile")
message(STATUS "make install | Install to ${CMAKE_INSTALL_PREFIX}. To change that:")
message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourpath")
message(STATUS " | Eigen headers will then be installed to:")
message(STATUS " | ${INCLUDE_INSTALL_DIR}")
message(STATUS " | ${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}")
message(STATUS " | To install Eigen headers to a separate location, do:")
message(STATUS " | cmake . -DEIGEN_INCLUDE_INSTALL_DIR=yourpath")
message(STATUS "make doc | Generate the API documentation, requires Doxygen & LaTeX")
@@ -441,31 +418,3 @@ else()
endif()
message(STATUS "")
set ( EIGEN_CONFIG_CMAKE_PATH
lib${LIB_SUFFIX}/cmake/eigen3
CACHE PATH "The directory where the CMake files are installed"
)
if ( NOT IS_ABSOLUTE EIGEN_CONFIG_CMAKE_PATH )
set ( EIGEN_CONFIG_CMAKE_PATH ${CMAKE_INSTALL_PREFIX}/${EIGEN_CONFIG_CMAKE_PATH} )
endif ()
set ( EIGEN_USE_FILE ${EIGEN_CONFIG_CMAKE_PATH}/UseEigen3.cmake )
set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
set ( EIGEN_VERSION_MAJOR ${EIGEN_WORLD_VERSION} )
set ( EIGEN_VERSION_MINOR ${EIGEN_MAJOR_VERSION} )
set ( EIGEN_VERSION_PATCH ${EIGEN_MINOR_VERSION} )
set ( EIGEN_DEFINITIONS "")
set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} )
set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
@ONLY ESCAPE_QUOTES
)
install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
)


@@ -4,14 +4,10 @@
## # The following are required to use Dart and the Cdash dashboard
## ENABLE_TESTING()
## INCLUDE(CTest)
set(CTEST_PROJECT_NAME "Eigen")
set(CTEST_PROJECT_NAME "Eigen3.2")
set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")
set(CTEST_DROP_METHOD "http")
set(CTEST_DROP_SITE "manao.inria.fr")
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen3.2")
set(CTEST_DROP_SITE_CDASH TRUE)
set(CTEST_PROJECT_SUBPROJECTS
Official
Unsupported
)

11 Eigen/Array Normal file

@@ -0,0 +1,11 @@
#ifndef EIGEN_ARRAY_MODULE_H
#define EIGEN_ARRAY_MODULE_H
// include Core first to handle Eigen2 support macros
#include "Core"
#ifndef EIGEN2_SUPPORT
#error The Eigen/Array header no longer exists in Eigen3. All that functionality has moved to Eigen/Core.
#endif
#endif // EIGEN_ARRAY_MODULE_H


@@ -10,11 +10,9 @@
*
*
* This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
* Those decompositions are also accessible via the following methods:
* - MatrixBase::llt()
* Those decompositions are accessible via the following MatrixBase methods:
* - MatrixBase::llt(),
* - MatrixBase::ldlt()
* - SelfAdjointView::llt()
* - SelfAdjointView::ldlt()
*
* \code
* #include <Eigen/Cholesky>


@@ -14,42 +14,6 @@
// first thing Eigen does: stop the compiler from committing suicide
#include "src/Core/util/DisableStupidWarnings.h"
// Handle NVCC/CUDA
#ifdef __CUDACC__
// Do not try asserts on CUDA!
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
#endif
#ifdef EIGEN_INTERNAL_DEBUGGING
#undef EIGEN_INTERNAL_DEBUGGING
#endif
// Do not try to vectorize on CUDA!
#define EIGEN_DONT_VECTORIZE
// All functions callable from CUDA code must be qualified with __device__
#define EIGEN_DEVICE_FUNC __host__ __device__
#else
#define EIGEN_DEVICE_FUNC
#endif
#if defined(__CUDA_ARCH__)
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
#else
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS)
#define EIGEN_EXCEPTIONS
#endif
#ifdef EIGEN_EXCEPTIONS
#include <new>
#endif
// then include this file where all our macros are defined. It's really important to do it first because
// it's where we do all the alignment settings (platform detection and honoring the user's will if he
// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
@@ -118,16 +82,7 @@
#ifdef __SSE4_2__
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __AVX__
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __FMA__
#define EIGEN_VECTORIZE_FMA
#endif
// include files
// This extern "C" works around a MINGW-w64 compilation issue
@@ -157,9 +112,6 @@
#ifdef EIGEN_VECTORIZE_SSE4_2
#include <nmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_AVX
#include <immintrin.h>
#endif
#endif
} // end extern "C"
#elif defined __ALTIVEC__
@@ -171,7 +123,7 @@
#undef bool
#undef vector
#undef pixel
#elif defined __ARM_NEON__
#elif defined __ARM_NEON
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_NEON
#include <arm_neon.h>
@@ -217,13 +169,19 @@
#include <intrin.h>
#endif
#if defined(_CPPUNWIND) || defined(__EXCEPTIONS)
#define EIGEN_EXCEPTIONS
#endif
#ifdef EIGEN_EXCEPTIONS
#include <new>
#endif
/** \brief Namespace containing all symbols from the %Eigen library. */
namespace Eigen {
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_AVX)
return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_2)
#if defined(EIGEN_VECTORIZE_SSE4_2)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
@@ -244,9 +202,34 @@ inline static const char *SimdInstructionSetsInUse(void) {
} // end namespace Eigen
#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
// This will generate an error message:
#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
#define STAGE10_FULL_EIGEN2_API 10
#define STAGE20_RESOLVE_API_CONFLICTS 20
#define STAGE30_FULL_EIGEN3_API 30
#define STAGE40_FULL_EIGEN3_STRICTNESS 40
#define STAGE99_NO_EIGEN2_SUPPORT 99
#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
#define EIGEN2_SUPPORT
#define EIGEN2_SUPPORT_STAGE STAGE40_FULL_EIGEN3_STRICTNESS
#elif defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
#define EIGEN2_SUPPORT
#define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
#elif defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
#define EIGEN2_SUPPORT
#define EIGEN2_SUPPORT_STAGE STAGE20_RESOLVE_API_CONFLICTS
#elif defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
#define EIGEN2_SUPPORT
#define EIGEN2_SUPPORT_STAGE STAGE10_FULL_EIGEN2_API
#elif defined EIGEN2_SUPPORT
// default to stage 3, that's what it has always meant
#define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
#define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
#else
#define EIGEN2_SUPPORT_STAGE STAGE99_NO_EIGEN2_SUPPORT
#endif
#ifdef EIGEN2_SUPPORT
#undef minor
#endif
// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
@@ -276,13 +259,7 @@ using std::ptrdiff_t;
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
#if defined EIGEN_VECTORIZE_AVX
// Use AVX for floats and doubles, SSE for integers
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/Complex.h"
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/Complex.h"
#elif defined EIGEN_VECTORIZE_SSE
#if defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/SSE/Complex.h"
@@ -296,30 +273,17 @@ using std::ptrdiff_t;
#include "src/Core/arch/Default/Settings.h"
#include "src/Core/functors/BinaryFunctors.h"
#include "src/Core/functors/UnaryFunctors.h"
#include "src/Core/functors/NullaryFunctors.h"
#include "src/Core/functors/StlFunctors.h"
#include "src/Core/Functors.h"
#include "src/Core/DenseCoeffsBase.h"
#include "src/Core/DenseBase.h"
#include "src/Core/MatrixBase.h"
#include "src/Core/EigenBase.h"
#ifdef EIGEN_ENABLE_EVALUATORS
#include "src/Core/functors/AssignmentFunctors.h"
#include "src/Core/Product.h"
#include "src/Core/CoreEvaluators.h"
#include "src/Core/AssignEvaluator.h"
#include "src/Core/ProductEvaluators.h"
#endif
#ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
// at least confirmed with Doxygen 1.5.5 and 1.5.6
#include "src/Core/Assign.h"
#endif
#include "src/Core/ArrayBase.h"
#include "src/Core/util/BlasUtil.h"
#include "src/Core/DenseStorage.h"
#include "src/Core/NestByValue.h"
@@ -383,6 +347,7 @@ using std::ptrdiff_t;
#include "src/Core/Random.h"
#include "src/Core/Replicate.h"
#include "src/Core/Reverse.h"
#include "src/Core/ArrayBase.h"
#include "src/Core/ArrayWrapper.h"
#ifdef EIGEN_USE_BLAS
@@ -404,4 +369,8 @@ using std::ptrdiff_t;
#include "src/Core/util/ReenableStupidWarnings.h"
#ifdef EIGEN2_SUPPORT
#include "Eigen2Support"
#endif
#endif // EIGEN_CORE_H


@@ -1,2 +1,2 @@
#include "Dense"
#include "Sparse"
//#include "Sparse"

95 Eigen/Eigen2Support Normal file

@@ -0,0 +1,95 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN2SUPPORT_H
#define EIGEN2SUPPORT_H
#if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H))
#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header
#endif
#ifndef EIGEN_NO_EIGEN2_DEPRECATED_WARNING
#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
#warning "Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)"
#else
#pragma message ("Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)")
#endif
#endif // EIGEN_NO_EIGEN2_DEPRECATED_WARNING
#include "src/Core/util/DisableStupidWarnings.h"
/** \ingroup Support_modules
* \defgroup Eigen2Support_Module Eigen2 support module
*
* \warning Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3.
*
* This module provides a couple of deprecated functions improving the compatibility with Eigen2.
*
* To use it, define EIGEN2_SUPPORT before including any Eigen header
* \code
* #define EIGEN2_SUPPORT
* \endcode
*
*/
#include "src/Eigen2Support/Macros.h"
#include "src/Eigen2Support/Memory.h"
#include "src/Eigen2Support/Meta.h"
#include "src/Eigen2Support/Lazy.h"
#include "src/Eigen2Support/Cwise.h"
#include "src/Eigen2Support/CwiseOperators.h"
#include "src/Eigen2Support/TriangularSolver.h"
#include "src/Eigen2Support/Block.h"
#include "src/Eigen2Support/VectorBlock.h"
#include "src/Eigen2Support/Minor.h"
#include "src/Eigen2Support/MathFunctions.h"
#include "src/Core/util/ReenableStupidWarnings.h"
// Eigen2 used to include iostream
#include<iostream>
#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
using Eigen::Matrix##SizeSuffix##TypeSuffix; \
using Eigen::Vector##SizeSuffix##TypeSuffix; \
using Eigen::RowVector##SizeSuffix##TypeSuffix;
#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(TypeSuffix) \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
#define EIGEN_USING_MATRIX_TYPEDEFS \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(i) \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(f) \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(d) \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cf) \
EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cd)
#define USING_PART_OF_NAMESPACE_EIGEN \
EIGEN_USING_MATRIX_TYPEDEFS \
using Eigen::Matrix; \
using Eigen::MatrixBase; \
using Eigen::ei_random; \
using Eigen::ei_real; \
using Eigen::ei_imag; \
using Eigen::ei_conj; \
using Eigen::ei_abs; \
using Eigen::ei_abs2; \
using Eigen::ei_sqrt; \
using Eigen::ei_exp; \
using Eigen::ei_log; \
using Eigen::ei_sin; \
using Eigen::ei_cos;
#endif // EIGEN2SUPPORT_H


@@ -33,23 +33,27 @@
#include "src/Geometry/OrthoMethods.h"
#include "src/Geometry/EulerAngles.h"
#include "src/Geometry/Homogeneous.h"
#include "src/Geometry/RotationBase.h"
#include "src/Geometry/Rotation2D.h"
#include "src/Geometry/Quaternion.h"
#include "src/Geometry/AngleAxis.h"
#include "src/Geometry/Transform.h"
#include "src/Geometry/Translation.h"
#include "src/Geometry/Scaling.h"
#include "src/Geometry/Hyperplane.h"
#include "src/Geometry/ParametrizedLine.h"
#include "src/Geometry/AlignedBox.h"
#include "src/Geometry/Umeyama.h"
#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
#include "src/Geometry/Homogeneous.h"
#include "src/Geometry/RotationBase.h"
#include "src/Geometry/Rotation2D.h"
#include "src/Geometry/Quaternion.h"
#include "src/Geometry/AngleAxis.h"
#include "src/Geometry/Transform.h"
#include "src/Geometry/Translation.h"
#include "src/Geometry/Scaling.h"
#include "src/Geometry/Hyperplane.h"
#include "src/Geometry/ParametrizedLine.h"
#include "src/Geometry/AlignedBox.h"
#include "src/Geometry/Umeyama.h"
// Use the SSE optimized version whenever possible. At the moment the
// SSE version doesn't compile when AVX is enabled
#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
#include "src/Geometry/arch/Geometry_SSE.h"
#if defined EIGEN_VECTORIZE_SSE
#include "src/Geometry/arch/Geometry_SSE.h"
#endif
#endif
#ifdef EIGEN2_SUPPORT
#include "src/Eigen2Support/Geometry/All.h"
#endif
#include "src/Core/util/ReenableStupidWarnings.h"


@@ -27,12 +27,14 @@
#include "src/LU/Determinant.h"
#include "src/LU/Inverse.h"
// Use the SSE optimized version whenever possible. At the moment the
// SSE version doesn't compile when AVX is enabled
#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
#if defined EIGEN_VECTORIZE_SSE
#include "src/LU/arch/Inverse_SSE.h"
#endif
#ifdef EIGEN2_SUPPORT
#include "src/Eigen2Support/LU.h"
#endif
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_LU_MODULE_H

32 Eigen/LeastSquares Normal file

@@ -0,0 +1,32 @@
#ifndef EIGEN_REGRESSION_MODULE_H
#define EIGEN_REGRESSION_MODULE_H
#ifndef EIGEN2_SUPPORT
#error LeastSquares is only available in Eigen2 support mode (define EIGEN2_SUPPORT)
#endif
// exclude from normal eigen3-only documentation
#ifdef EIGEN2_SUPPORT
#include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "Eigenvalues"
#include "Geometry"
/** \defgroup LeastSquares_Module LeastSquares module
* This module provides linear regression and related features.
*
* \code
* #include <Eigen/LeastSquares>
* \endcode
*/
#include "src/Eigen2Support/LeastSquares.h"
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN2_SUPPORT
#endif // EIGEN_REGRESSION_MODULE_H


@@ -15,9 +15,7 @@
*
* This module provides various QR decompositions
* This module also provides some MatrixBase methods, including:
* - MatrixBase::householderQr()
* - MatrixBase::colPivHouseholderQr()
* - MatrixBase::fullPivHouseholderQr()
* - MatrixBase::qr(),
*
* \code
* #include <Eigen/QR>
@@ -33,7 +31,15 @@
#include "src/QR/ColPivHouseholderQR_MKL.h"
#endif
#ifdef EIGEN2_SUPPORT
#include "src/Eigen2Support/QR.h"
#endif
#include "src/Core/util/ReenableStupidWarnings.h"
#ifdef EIGEN2_SUPPORT
#include "Eigenvalues"
#endif
#endif // EIGEN_QR_MODULE_H
/* vim: set filetype=cpp et sw=2 ts=2 ai: */


@@ -21,13 +21,16 @@
*/
#include "src/misc/Solve.h"
#include "src/SVD/SVDBase.h"
#include "src/SVD/JacobiSVD.h"
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
#include "src/SVD/JacobiSVD_MKL.h"
#endif
#include "src/SVD/UpperBidiagonalization.h"
#ifdef EIGEN2_SUPPORT
#include "src/Eigen2Support/SVD.h"
#endif
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_SVD_MODULE_H


@@ -14,7 +14,7 @@
/**
* \defgroup SparseCore_Module SparseCore module
*
* This module provides a sparse matrix representation, and basic associatd matrix manipulations
* This module provides a sparse matrix representation, and basic associated matrix manipulations
* and operations.
*
* See the \ref TutorialSparse "Sparse tutorial"


@@ -43,7 +43,7 @@ namespace internal {
* Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
* decomposition to determine whether a system of equations has a solution.
*
* \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
* \sa MatrixBase::ldlt(), class LLT
*/
template<typename _MatrixType, int _UpLo> class LDLT
{
@@ -151,6 +151,13 @@ template<typename _MatrixType, int _UpLo> class LDLT
eigen_assert(m_isInitialized && "LDLT is not initialized.");
return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
}
#ifdef EIGEN2_SUPPORT
inline bool isPositiveDefinite() const
{
return isPositive();
}
#endif
/** \returns true if the matrix is negative (semidefinite) */
inline bool isNegative(void) const
@@ -172,7 +179,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
* least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
* computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular.
*
* \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
* \sa MatrixBase::ldlt()
*/
template<typename Rhs>
inline const internal::solve_retval<LDLT, Rhs>
@@ -184,6 +191,15 @@ template<typename _MatrixType, int _UpLo> class LDLT
return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
}
#ifdef EIGEN2_SUPPORT
template<typename OtherDerived, typename ResultType>
bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
{
*result = this->solve(b);
return true;
}
#endif
template<typename Derived>
bool solveInPlace(MatrixBase<Derived> &bAndX) const;
@@ -219,6 +235,11 @@ template<typename _MatrixType, int _UpLo> class LDLT
}
protected:
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
/** \internal
* Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
@@ -246,7 +267,6 @@ template<> struct ldlt_inplace<Lower>
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
typedef typename MatrixType::Index Index;
typedef typename TranspositionType::StorageIndexType IndexType;
eigen_assert(mat.rows()==mat.cols());
const Index size = mat.rows();
@@ -266,7 +286,7 @@ template<> struct ldlt_inplace<Lower>
mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
index_of_biggest_in_corner += k;
transpositions.coeffRef(k) = IndexType(index_of_biggest_in_corner);
transpositions.coeffRef(k) = index_of_biggest_in_corner;
if(k != index_of_biggest_in_corner)
{
// apply the transposition while taking care to consider only
@@ -275,7 +295,7 @@ template<> struct ldlt_inplace<Lower>
mat.row(k).head(k).swap(mat.row(index_of_biggest_in_corner).head(k));
mat.col(k).tail(s).swap(mat.col(index_of_biggest_in_corner).tail(s));
std::swap(mat.coeffRef(k,k),mat.coeffRef(index_of_biggest_in_corner,index_of_biggest_in_corner));
for(Index i=k+1;i<index_of_biggest_in_corner;++i)
for(int i=k+1;i<index_of_biggest_in_corner;++i)
{
Scalar tmp = mat.coeffRef(i,k);
mat.coeffRef(i,k) = numext::conj(mat.coeffRef(index_of_biggest_in_corner,i));
@@ -419,6 +439,8 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
template<typename MatrixType, int _UpLo>
LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
{
check_template_parameters();
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
@@ -427,6 +449,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
m_transpositions.resize(size);
m_isInitialized = false;
m_temporary.resize(size);
m_sign = internal::ZeroSign;
internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
@@ -441,9 +464,8 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
*/
template<typename MatrixType, int _UpLo>
template<typename Derived>
LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename NumTraits<typename MatrixType::Scalar>::Real& sigma)
LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename LDLT<MatrixType,_UpLo>::RealScalar& sigma)
{
typedef typename TranspositionType::StorageIndexType IndexType;
const Index size = w.rows();
if (m_isInitialized)
{
@@ -455,7 +477,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
m_matrix.setZero();
m_transpositions.resize(size);
for (Index i = 0; i < size; i++)
m_transpositions.coeffRef(i) = IndexType(i);
m_transpositions.coeffRef(i) = i;
m_temporary.resize(size);
m_sign = sigma>=0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
m_isInitialized = true;
@@ -486,7 +508,7 @@ struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
// dst = D^-1 (L^-1 P b)
// more precisely, use pseudo-inverse of D (see bug 241)
using std::abs;
EIGEN_USING_STD_MATH(max);
using std::max;
typedef typename LDLTType::MatrixType MatrixType;
typedef typename LDLTType::RealScalar RealScalar;
const typename Diagonal<const MatrixType>::RealReturnType vectorD(dec().vectorD());
@@ -497,6 +519,7 @@ struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
// diagonal element is not well justified and to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
for (Index i = 0; i < vectorD.size(); ++i) {
if(abs(vectorD(i)) > tolerance)
dst.row(i) /= vectorD(i);
@@ -563,10 +586,8 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
return res;
}
#ifndef __CUDACC__
/** \cholesky_module
* \returns the Cholesky decomposition with full pivoting without square root of \c *this
* \sa MatrixBase::ldlt()
*/
template<typename MatrixType, unsigned int UpLo>
inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -577,7 +598,6 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const
/** \cholesky_module
* \returns the Cholesky decomposition with full pivoting without square root of \c *this
* \sa SelfAdjointView::ldlt()
*/
template<typename Derived>
inline const LDLT<typename MatrixBase<Derived>::PlainObject>
@@ -585,7 +605,6 @@ MatrixBase<Derived>::ldlt() const
{
return LDLT<PlainObject>(derived());
}
#endif // __CUDACC__
} // end namespace Eigen


@@ -41,7 +41,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out
*
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
* \sa MatrixBase::llt(), class LDLT
*/
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
* Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
@@ -115,7 +115,7 @@ template<typename _MatrixType, int _UpLo> class LLT
* Example: \include LLT_solve.cpp
* Output: \verbinclude LLT_solve.out
*
* \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
* \sa solveInPlace(), MatrixBase::llt()
*/
template<typename Rhs>
inline const internal::solve_retval<LLT, Rhs>
@@ -127,6 +127,17 @@ template<typename _MatrixType, int _UpLo> class LLT
return internal::solve_retval<LLT, Rhs>(*this, b.derived());
}
#ifdef EIGEN2_SUPPORT
template<typename OtherDerived, typename ResultType>
bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
{
*result = this->solve(b);
return true;
}
bool isPositiveDefinite() const { return true; }
#endif
template<typename Derived>
void solveInPlace(MatrixBase<Derived> &bAndX) const;
@@ -163,6 +174,12 @@ template<typename _MatrixType, int _UpLo> class LLT
LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
protected:
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
/** \internal
* Used to compute and store L
* The strict upper part is not used and even not initialized.
@@ -272,7 +289,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
return k;
mat.coeffRef(k,k) = x = sqrt(x);
if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
if (rs>0) A21 *= RealScalar(1)/x;
if (rs>0) A21 /= x;
}
return -1;
}
@@ -373,6 +390,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
template<typename MatrixType, int _UpLo>
LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
{
check_template_parameters();
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix.resize(size, size);
@@ -454,10 +473,8 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
return matrixL() * matrixL().adjoint().toDenseMatrix();
}
#ifndef __CUDACC__
/** \cholesky_module
* \returns the LLT decomposition of \c *this
* \sa SelfAdjointView::llt()
*/
template<typename Derived>
inline const LLT<typename MatrixBase<Derived>::PlainObject>
@@ -468,7 +485,6 @@ MatrixBase<Derived>::llt() const
/** \cholesky_module
* \returns the LLT decomposition of \c *this
* \sa SelfAdjointView::llt()
*/
template<typename MatrixType, unsigned int UpLo>
inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -476,8 +492,7 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
{
return LLT<PlainObject,UpLo>(m_matrix);
}
#endif // __CUDACC__
} // end namespace Eigen
#endif // EIGEN_LLT_H


@@ -60,7 +60,7 @@ template<> struct mkl_llt<EIGTYPE> \
lda = m.outerStride(); \
\
info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
info = (info==0) ? Success : NumericalIssue; \
info = (info==0) ? -1 : info>0 ? info-1 : size; \
return info; \
} \
}; \


@@ -78,7 +78,7 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
{
res.itype = CHOLMOD_INT;
}
else if (internal::is_same<_Index,UF_long>::value)
else if (internal::is_same<_Index,SuiteSparse_long>::value)
{
res.itype = CHOLMOD_LONG;
}
@@ -395,7 +395,7 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
CholmodSimplicialLLT(const MatrixType& matrix) : Base()
{
init();
compute(matrix);
Base::compute(matrix);
}
~CholmodSimplicialLLT() {}
@@ -442,7 +442,7 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
{
init();
compute(matrix);
Base::compute(matrix);
}
~CholmodSimplicialLDLT() {}
@@ -487,7 +487,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
CholmodSupernodalLLT(const MatrixType& matrix) : Base()
{
init();
compute(matrix);
Base::compute(matrix);
}
~CholmodSupernodalLLT() {}
@@ -534,7 +534,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
CholmodDecomposition(const MatrixType& matrix) : Base()
{
init();
compute(matrix);
Base::compute(matrix);
}
~CholmodDecomposition() {}


@@ -69,7 +69,6 @@ class Array
* the usage of 'using'. This should be done only for operator=.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived> &other)
{
return Base::operator=(other);
@@ -85,7 +84,6 @@ class Array
* remain row-vectors and vectors remain vectors.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
{
return Base::_set(other);
@@ -94,7 +92,6 @@ class Array
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array& operator=(const Array& other)
{
return Base::_set(other);
@@ -110,7 +107,6 @@ class Array
*
* \sa resize(Index,Index)
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array() : Base()
{
Base::_check_template_params();
@@ -120,7 +116,6 @@ class Array
#ifndef EIGEN_PARSED_BY_DOXYGEN
// FIXME is it still needed ??
/** \internal */
EIGEN_DEVICE_FUNC
Array(internal::constructor_without_unaligned_array_assert)
: Base(internal::constructor_without_unaligned_array_assert())
{
@@ -144,48 +139,41 @@ class Array
}
#endif
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit Array(const T& x)
/** Constructs a vector or row-vector with given dimension. \only_for_vectors
*
* Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
* it is redundant to pass the dimension here, so it makes more sense to use the default
* constructor Matrix() instead.
*/
EIGEN_STRONG_INLINE explicit Array(Index dim)
: Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
{
Base::_check_template_params();
Base::template _init1<T>(x);
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Array)
eigen_assert(dim >= 0);
eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename T0, typename T1>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1)
{
Base::_check_template_params();
this->template _init2<T0,T1>(val0, val1);
}
#else
/** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
/** Constructs a vector or row-vector with given dimension. \only_for_vectors
/** constructs an uninitialized matrix with \a rows rows and \a cols columns.
*
* Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
* it is redundant to pass the dimension here, so it makes more sense to use the default
* constructor Array() instead.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit Array(Index dim);
/** constructs an initialized 1x1 Array with the given coefficient */
Array(const Scalar& value);
/** constructs an uninitialized array with \a rows rows and \a cols columns.
*
* This is useful for dynamic-size arrays. For fixed-size arrays,
* This is useful for dynamic-size matrices. For fixed-size matrices,
* it is redundant to pass these parameters, so one should use the default constructor
* Array() instead. */
* Matrix() instead. */
Array(Index rows, Index cols);
/** constructs an initialized 2D vector with given coefficients */
Array(const Scalar& val0, const Scalar& val1);
#endif
/** constructs an initialized 3D vector with given coefficients */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
{
Base::_check_template_params();
@@ -195,7 +183,6 @@ class Array
m_storage.data()[2] = val2;
}
/** constructs an initialized 4D vector with given coefficients */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
{
Base::_check_template_params();
@@ -206,9 +193,10 @@ class Array
m_storage.data()[3] = val3;
}
explicit Array(const Scalar *data);
/** Constructor copying the value of the expression \a other */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
: Base(other.rows() * other.cols(), other.rows(), other.cols())
{
@@ -216,7 +204,6 @@ class Array
Base::_set_noalias(other);
}
/** Copy constructor */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const Array& other)
: Base(other.rows() * other.cols(), other.rows(), other.cols())
{
@@ -225,7 +212,6 @@ class Array
}
/** Copy constructor with in-place evaluation */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
{
Base::_check_template_params();
@@ -235,7 +221,6 @@ class Array
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
: Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
{
@@ -251,8 +236,8 @@ class Array
void swap(ArrayBase<OtherDerived> const & other)
{ this->_swap(other.derived()); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
inline Index innerStride() const { return 1; }
inline Index outerStride() const { return this->innerSize(); }
#ifdef EIGEN_ARRAY_PLUGIN
#include EIGEN_ARRAY_PLUGIN


@@ -46,9 +46,6 @@ template<typename Derived> class ArrayBase
typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;
using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
typedef typename internal::traits<Derived>::StorageKind StorageKind;
typedef typename internal::traits<Derived>::Index Index;
typedef typename internal::traits<Derived>::Scalar Scalar;
@@ -56,6 +53,7 @@ template<typename Derived> class ArrayBase
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef DenseBase<Derived> Base;
using Base::operator*;
using Base::RowsAtCompileTime;
using Base::ColsAtCompileTime;
using Base::SizeAtCompileTime;
@@ -118,50 +116,40 @@ template<typename Derived> class ArrayBase
/** Special case of the template operator=, in order to prevent the compiler
* from generating a default operator= (issue hit with g++ 4.1)
*/
EIGEN_DEVICE_FUNC
Derived& operator=(const ArrayBase& other)
{
return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
}
EIGEN_DEVICE_FUNC
Derived& operator+=(const Scalar& scalar);
EIGEN_DEVICE_FUNC
Derived& operator-=(const Scalar& scalar);
Derived& operator+=(const Scalar& scalar)
{ return *this = derived() + scalar; }
Derived& operator-=(const Scalar& scalar)
{ return *this = derived() - scalar; }
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator+=(const ArrayBase<OtherDerived>& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator-=(const ArrayBase<OtherDerived>& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator*=(const ArrayBase<OtherDerived>& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator/=(const ArrayBase<OtherDerived>& other);
public:
EIGEN_DEVICE_FUNC
ArrayBase<Derived>& array() { return *this; }
EIGEN_DEVICE_FUNC
const ArrayBase<Derived>& array() const { return *this; }
/** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
* \sa MatrixBase::array() */
EIGEN_DEVICE_FUNC
MatrixWrapper<Derived> matrix() { return derived(); }
EIGEN_DEVICE_FUNC
const MatrixWrapper<const Derived> matrix() const { return derived(); }
// template<typename Dest>
// inline void evalTo(Dest& dst) const { dst = matrix(); }
protected:
EIGEN_DEVICE_FUNC
ArrayBase() : Base() {}
private:


@@ -53,54 +53,41 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
inline ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_expression.rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return m_expression.cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return m_expression.outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index rowId, Index colId) const
{
return m_expression.coeff(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
return m_expression.const_cast_derived().coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
return m_expression.const_cast_derived().coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index index) const
{
return m_expression.coeff(index);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
return m_expression.const_cast_derived().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return m_expression.const_cast_derived().coeffRef(index);
@@ -131,11 +118,9 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
}
template<typename Dest>
EIGEN_DEVICE_FUNC
inline void evalTo(Dest& dst) const { dst = m_expression; }
const typename internal::remove_all<NestedExpressionType>::type&
EIGEN_DEVICE_FUNC
nestedExpression() const
{
return m_expression;
@@ -143,11 +128,9 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index) */
EIGEN_DEVICE_FUNC
void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index,Index)*/
EIGEN_DEVICE_FUNC
void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
protected:
@@ -195,54 +178,41 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
EIGEN_DEVICE_FUNC
inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_expression.rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return m_expression.cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return m_expression.outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index rowId, Index colId) const
{
return m_expression.coeff(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
return m_expression.const_cast_derived().coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
return m_expression.derived().coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index index) const
{
return m_expression.coeff(index);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
return m_expression.const_cast_derived().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return m_expression.const_cast_derived().coeffRef(index);
@@ -272,7 +242,6 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<NestedExpressionType>::type&
nestedExpression() const
{
@@ -281,11 +250,9 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index) */
EIGEN_DEVICE_FUNC
void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index,Index)*/
EIGEN_DEVICE_FUNC
void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
protected:


@@ -105,8 +105,6 @@ public:
EIGEN_DEBUG_VAR(DstIsAligned)
EIGEN_DEBUG_VAR(SrcIsAligned)
EIGEN_DEBUG_VAR(JointAlignment)
EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime)
EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost)
EIGEN_DEBUG_VAR(InnerSize)
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
@@ -141,7 +139,6 @@ struct assign_DefaultTraversal_CompleteUnrolling
inner = Index % Derived1::InnerSizeAtCompileTime
};
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
dst.copyCoeffByOuterInner(outer, inner, src);
@@ -152,14 +149,12 @@ struct assign_DefaultTraversal_CompleteUnrolling
template<typename Derived1, typename Derived2, int Stop>
struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
};
template<typename Derived1, typename Derived2, int Index, int Stop>
struct assign_DefaultTraversal_InnerUnrolling
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
{
dst.copyCoeffByOuterInner(outer, Index, src);
@@ -170,7 +165,6 @@ struct assign_DefaultTraversal_InnerUnrolling
template<typename Derived1, typename Derived2, int Stop>
struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
};
@@ -181,7 +175,6 @@ struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
template<typename Derived1, typename Derived2, int Index, int Stop>
struct assign_LinearTraversal_CompleteUnrolling
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
dst.copyCoeff(Index, src);
@@ -192,7 +185,6 @@ struct assign_LinearTraversal_CompleteUnrolling
template<typename Derived1, typename Derived2, int Stop>
struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
};
@@ -257,7 +249,6 @@ struct assign_impl;
template<typename Derived1, typename Derived2, int Unrolling, int Version>
struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &, const Derived2 &) { }
};
@@ -265,7 +256,6 @@ template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
const Index innerSize = dst.innerSize();
@@ -279,7 +269,6 @@ struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
@@ -291,7 +280,6 @@ template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
const Index outerSize = dst.outerSize();
@@ -309,7 +297,6 @@ template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
const Index size = dst.size();
@@ -321,7 +308,6 @@ struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
@@ -453,19 +439,26 @@ struct assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling, Ve
typedef typename Derived1::Index Index;
static inline void run(Derived1 &dst, const Derived2 &src)
{
typedef packet_traits<typename Derived1::Scalar> PacketTraits;
typedef typename Derived1::Scalar Scalar;
typedef packet_traits<Scalar> PacketTraits;
enum {
packetSize = PacketTraits::size,
alignable = PacketTraits::AlignedOnScalar,
dstAlignment = alignable ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
dstIsAligned = assign_traits<Derived1,Derived2>::DstIsAligned,
dstAlignment = alignable ? Aligned : int(dstIsAligned),
srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
};
const Scalar *dst_ptr = &dst.coeffRef(0,0);
if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
{
// the pointer is not aligned on scalar, so alignment is not possible
return assign_impl<Derived1,Derived2,DefaultTraversal,NoUnrolling>::run(dst, src);
}
const Index packetAlignedMask = packetSize - 1;
const Index innerSize = dst.innerSize();
const Index outerSize = dst.outerSize();
const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
Index alignedStart = ((!alignable) || assign_traits<Derived1,Derived2>::DstIsAligned) ? 0
: internal::first_aligned(&dst.coeffRef(0,0), innerSize);
Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize);
for(Index outer = 0; outer < outerSize; ++outer)
{
@@ -506,25 +499,12 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
#ifdef EIGEN_TEST_EVALUATORS
#ifdef EIGEN_DEBUG_ASSIGN
internal::copy_using_evaluator_traits<Derived, OtherDerived>::debug();
#endif
eigen_assert(rows() == other.rows() && cols() == other.cols());
internal::call_dense_assignment_loop(derived(),other.derived());
#else // EIGEN_TEST_EVALUATORS
#ifdef EIGEN_DEBUG_ASSIGN
internal::assign_traits<Derived, OtherDerived>::debug();
#endif
eigen_assert(rows() == other.rows() && cols() == other.cols());
internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
: int(InvalidTraversal)>::run(derived(),other.derived());
#endif // EIGEN_TEST_EVALUATORS
: int(InvalidTraversal)>::run(derived(),other.derived());
#ifndef EIGEN_NO_DEBUG
checkTransposeAliasing(other.derived());
#endif
@@ -544,28 +524,22 @@ struct assign_selector;
template<typename Derived, typename OtherDerived>
struct assign_selector<Derived,OtherDerived,false,false> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
template<typename ActualDerived, typename ActualOtherDerived>
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
};
template<typename Derived, typename OtherDerived>
struct assign_selector<Derived,OtherDerived,true,false> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
};
template<typename Derived, typename OtherDerived>
struct assign_selector<Derived,OtherDerived,false,true> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
template<typename ActualDerived, typename ActualOtherDerived>
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
};
template<typename Derived, typename OtherDerived>
struct assign_selector<Derived,OtherDerived,true,true> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
};
@@ -573,21 +547,18 @@ struct assign_selector<Derived,OtherDerived,true,true> {
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
{
return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
}
template<typename Derived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
{
return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
}
template<typename Derived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
{
return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
@@ -595,7 +566,6 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& ot
template<typename Derived>
template <typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
{
return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
@@ -603,7 +573,6 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<Othe
template<typename Derived>
template <typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
{
return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
@@ -611,7 +580,6 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<Othe
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
{
return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());


@@ -1,842 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2011-2013 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ASSIGN_EVALUATOR_H
#define EIGEN_ASSIGN_EVALUATOR_H
namespace Eigen {
// This implementation is based on Assign.h
namespace internal {
/***************************************************************************
* Part 1 : the logic deciding a strategy for traversal and unrolling *
***************************************************************************/
// copy_using_evaluator_traits is based on assign_traits
template <typename Derived, typename OtherDerived>
struct copy_using_evaluator_traits
{
public:
enum {
DstIsAligned = Derived::Flags & AlignedBit,
DstHasDirectAccess = Derived::Flags & DirectAccessBit,
SrcIsAligned = OtherDerived::Flags & AlignedBit,
JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned,
SrcEvalBeforeAssign = (evaluator_traits<OtherDerived>::HasEvalTo == 1)
};
private:
enum {
InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
: int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
: int(Derived::RowsAtCompileTime),
InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
: int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
: int(Derived::MaxRowsAtCompileTime),
MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
PacketSize = packet_traits<typename Derived::Scalar>::size
};
enum {
StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
MightVectorize = StorageOrdersAgree
&& (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
&& int(DstIsAligned) && int(SrcIsAligned),
MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
&& (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */
MaySliceVectorize = MightVectorize && DstHasDirectAccess
&& (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
/* slice vectorization can be slow, so we only want it if the slices are big, which is
   indicated by InnerMaxSize rather than InnerSize; think of the case of a dynamic block
   in a fixed-size matrix */
};
public:
enum {
Traversal = int(SrcEvalBeforeAssign) ? int(AllAtOnceTraversal)
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
: int(MayLinearize) ? int(LinearTraversal)
: int(DefaultTraversal),
Vectorized = int(Traversal) == InnerVectorizedTraversal
|| int(Traversal) == LinearVectorizedTraversal
|| int(Traversal) == SliceVectorizedTraversal
};
private:
enum {
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
&& int(OtherDerived::CoeffReadCost) != Dynamic
&& int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
MayUnrollInner = int(InnerSize) != Dynamic
&& int(OtherDerived::CoeffReadCost) != Dynamic
&& int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
};
public:
enum {
Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
? (
int(MayUnrollCompletely) ? int(CompleteUnrolling)
: int(MayUnrollInner) ? int(InnerUnrolling)
: int(NoUnrolling)
)
: int(Traversal) == int(LinearVectorizedTraversal)
? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling)
: int(NoUnrolling) )
: int(Traversal) == int(LinearTraversal)
? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
: int(NoUnrolling) )
: int(NoUnrolling)
};
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
EIGEN_DEBUG_VAR(DstIsAligned)
EIGEN_DEBUG_VAR(SrcIsAligned)
EIGEN_DEBUG_VAR(JointAlignment)
EIGEN_DEBUG_VAR(InnerSize)
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
EIGEN_DEBUG_VAR(StorageOrdersAgree)
EIGEN_DEBUG_VAR(MightVectorize)
EIGEN_DEBUG_VAR(MayLinearize)
EIGEN_DEBUG_VAR(MayInnerVectorize)
EIGEN_DEBUG_VAR(MayLinearVectorize)
EIGEN_DEBUG_VAR(MaySliceVectorize)
EIGEN_DEBUG_VAR(Traversal)
EIGEN_DEBUG_VAR(UnrollingLimit)
EIGEN_DEBUG_VAR(MayUnrollCompletely)
EIGEN_DEBUG_VAR(MayUnrollInner)
EIGEN_DEBUG_VAR(Unrolling)
}
#endif
};
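The debug() hook above makes the chosen strategy observable. A minimal sketch, assuming EIGEN_DEBUG_ASSIGN is defined before any Eigen header (the legacy assign path has an analogous hook, so plain assignments print their decisions too):

#define EIGEN_DEBUG_ASSIGN
#include <iostream>
#include <Eigen/Core>

int main()
{
  Eigen::Matrix4f a, b;
  b.setRandom();
  a = b; // dumps DstIsAligned, Traversal, Unrolling, ... to stderr via EIGEN_DEBUG_VAR
}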
/***************************************************************************
* Part 2 : meta-unrollers
***************************************************************************/
/************************
*** Default traversal ***
************************/
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
{
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
typedef typename DstEvaluatorType::XprType DstXprType;
enum {
outer = Index / DstXprType::InnerSizeAtCompileTime,
inner = Index % DstXprType::InnerSizeAtCompileTime
};
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
kernel.assignCoeffByOuterInner(outer, inner);
copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
}
};
template<typename Kernel, int Stop>
struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel&) { }
};
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel, int outer)
{
kernel.assignCoeffByOuterInner(outer, Index);
copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index+1, Stop>::run(kernel, outer);
}
};
template<typename Kernel, int Stop>
struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel&, int) { }
};
/***********************
*** Linear traversal ***
***********************/
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
{
static EIGEN_STRONG_INLINE void run(Kernel& kernel)
{
kernel.assignCoeff(Index);
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
}
};
template<typename Kernel, int Stop>
struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel&) { }
};
/**************************
*** Inner vectorization ***
**************************/
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_innervec_CompleteUnrolling
{
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
typedef typename DstEvaluatorType::XprType DstXprType;
enum {
outer = Index / DstXprType::InnerSizeAtCompileTime,
inner = Index % DstXprType::InnerSizeAtCompileTime,
JointAlignment = Kernel::AssignmentTraits::JointAlignment
};
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
kernel.template assignPacketByOuterInner<Aligned, JointAlignment>(outer, inner);
enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
}
};
template<typename Kernel, int Stop>
struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel&) { }
};
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_innervec_InnerUnrolling
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel, int outer)
{
kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, Index);
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
}
};
template<typename Kernel, int Stop>
struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel &, int) { }
};
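All of these meta-unrollers follow the same shape. A reduced, self-contained sketch of the pattern (illustrative names, not Eigen API): recursion on a compile-time index, terminated by the Index == Stop partial specialization, so the compiler expands the loop body with constant indices.

template<int Index, int Stop>
struct unroll
{
  template<typename F>
  static inline void run(F& f) { f(Index); unroll<Index+1, Stop>::run(f); }
};
template<int Stop>
struct unroll<Stop, Stop> // termination: empty body
{
  template<typename F>
  static inline void run(F&) {}
};
// unroll<0, 4>::run(f) expands to f(0); f(1); f(2); f(3);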
/***************************************************************************
* Part 3 : implementation of all cases
***************************************************************************/
// dense_assignment_loop is based on assign_impl
template<typename Kernel,
int Traversal = Kernel::AssignmentTraits::Traversal,
int Unrolling = Kernel::AssignmentTraits::Unrolling>
struct dense_assignment_loop;
/************************
*** Default traversal ***
************************/
template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
{
static void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
kernel.assignCoeffByOuterInner(outer, inner);
}
}
}
};
template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
}
};
template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
{
typedef typename Kernel::Index Index;
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
const Index outerSize = kernel.outerSize();
for(Index outer = 0; outer < outerSize; ++outer)
copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
}
};
/***************************
*** Linear vectorization ***
***************************/
// The goal of unaligned_dense_assignment_loop is simply to factor out the handling
// of the non-vectorizable beginning and ending parts
template <bool IsAligned = false>
struct unaligned_dense_assignment_loop
{
// if IsAligned = true, then do nothing
template <typename Kernel>
static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index, typename Kernel::Index) {}
};
template <>
struct unaligned_dense_assignment_loop<false>
{
// MSVC must not inline these functions. If it does, it fails to optimize the
// packet access path.
// FIXME check which version exhibits this issue
#ifdef _MSC_VER
template <typename Kernel>
static EIGEN_DONT_INLINE void run(Kernel &kernel,
typename Kernel::Index start,
typename Kernel::Index end)
#else
template <typename Kernel>
static EIGEN_STRONG_INLINE void run(Kernel &kernel,
typename Kernel::Index start,
typename Kernel::Index end)
#endif
{
for (typename Kernel::Index index = start; index < end; ++index)
kernel.assignCoeff(index);
}
};
template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
const Index size = kernel.size();
typedef packet_traits<typename Kernel::Scalar> PacketTraits;
enum {
packetSize = PacketTraits::size,
dstIsAligned = int(Kernel::AssignmentTraits::DstIsAligned),
dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : dstIsAligned,
srcAlignment = Kernel::AssignmentTraits::JointAlignment
};
const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
for(Index index = alignedStart; index < alignedEnd; index += packetSize)
kernel.template assignPacket<dstAlignment, srcAlignment>(index);
unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
}
};
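The alignedStart/alignedEnd arithmetic above splits the 1-D range into a scalar prologue, an aligned packet body, and a scalar epilogue. A standalone sketch of the same bookkeeping (hypothetical helper; plain inner loops stand in for the packet stores):

#include <cstddef>
#include <cstdint>

void assign_linear(float* dst, const float* src, std::size_t size)
{
  const std::size_t packetSize = 4; // e.g. one SSE packet of floats
  std::size_t offset = (reinterpret_cast<std::uintptr_t>(dst) / sizeof(float)) % packetSize;
  std::size_t alignedStart = offset ? packetSize - offset : 0;
  if (alignedStart > size) alignedStart = size;
  const std::size_t alignedEnd = alignedStart + ((size - alignedStart) / packetSize) * packetSize;
  for (std::size_t i = 0; i < alignedStart; ++i) dst[i] = src[i];         // scalar prologue
  for (std::size_t i = alignedStart; i < alignedEnd; i += packetSize)     // packet body
    for (std::size_t k = 0; k < packetSize; ++k) dst[i + k] = src[i + k]; //   (stand-in for assignPacket)
  for (std::size_t i = alignedEnd; i < size; ++i) dst[i] = src[i];        // scalar epilogue
}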
template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
{
typedef typename Kernel::Index Index;
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
enum { size = DstXprType::SizeAtCompileTime,
packetSize = packet_traits<typename Kernel::Scalar>::size,
alignedSize = (size/packetSize)*packetSize };
copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
}
};
/**************************
*** Inner vectorization ***
**************************/
template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
{
static inline void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
const Index innerSize = kernel.innerSize();
const Index outerSize = kernel.outerSize();
const Index packetSize = packet_traits<typename Kernel::Scalar>::size;
for(Index outer = 0; outer < outerSize; ++outer)
for(Index inner = 0; inner < innerSize; inner+=packetSize)
kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, inner);
}
};
template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
}
};
template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
{
typedef typename Kernel::Index Index;
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
const Index outerSize = kernel.outerSize();
for(Index outer = 0; outer < outerSize; ++outer)
copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
}
};
/***********************
*** Linear traversal ***
***********************/
template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
{
static inline void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
const Index size = kernel.size();
for(Index i = 0; i < size; ++i)
kernel.assignCoeff(i);
}
};
template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
}
};
/**************************
*** Slice vectorization ***
***************************/
template<typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
{
static inline void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
typedef packet_traits<typename Kernel::Scalar> PacketTraits;
enum {
packetSize = PacketTraits::size,
alignable = PacketTraits::AlignedOnScalar,
dstAlignment = alignable ? Aligned : int(Kernel::AssignmentTraits::DstIsAligned)
};
const Index packetAlignedMask = packetSize - 1;
const Index innerSize = kernel.innerSize();
const Index outerSize = kernel.outerSize();
const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
Index alignedStart = ((!alignable) || Kernel::AssignmentTraits::DstIsAligned) ? 0
: internal::first_aligned(&kernel.dstEvaluator().coeffRef(0,0), innerSize);
for(Index outer = 0; outer < outerSize; ++outer)
{
const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
// do the non-vectorizable part of the assignment
for(Index inner = 0; inner<alignedStart ; ++inner)
kernel.assignCoeffByOuterInner(outer, inner);
// do the vectorizable part of the assignment
for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
kernel.template assignPacketByOuterInner<dstAlignment, Unaligned>(outer, inner);
// do the non-vectorizable part of the assignment
for(Index inner = alignedEnd; inner<innerSize ; ++inner)
kernel.assignCoeffByOuterInner(outer, inner);
alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
}
}
};
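A worked micro-example of the per-outer alignment shift above (illustrative numbers): for float data with packetSize = 4, innerSize = 5 and outerStride = 5, alignedStep = (4 - 5 % 4) & 3 = 3, so the first aligned inner index cycles 0 -> 3 -> 2 -> 1 -> 0 as outer advances; this matches the fact that each new column starts 5 floats (one packet plus one scalar) after the previous one.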
/****************************
*** All-at-once traversal ***
****************************/
// TODO: this 'AllAtOnceTraversal' should be dropped or caught earlier (Gael)
// Indeed, what to do with the kernel's functor??
template<typename Kernel>
struct dense_assignment_loop<Kernel, AllAtOnceTraversal, NoUnrolling>
{
static inline void run(Kernel & kernel)
{
// Evaluate rhs in temporary to prevent aliasing problems in a = a * a;
// TODO: Do not pass the xpr object to evalTo() (Jitse)
kernel.srcEvaluator().evalTo(kernel.dstEvaluator(), kernel.dstExpression());
}
};
/***************************************************************************
* Part 4 : Generic Assignment routine
***************************************************************************/
// This class generalizes the assignment of a coefficient (or packet) from one dense evaluator
// to another dense writable evaluator.
// It is parametrized by the two evaluators and the actual assignment functor.
// This abstraction level keeps the evaluation loops as simple and as generic as possible.
// One can customize the assignment using this generic dense_assignment_kernel with different
// functors, or by overloading it completely, bypassing the functor.
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
class generic_dense_assignment_kernel
{
protected:
typedef typename DstEvaluatorTypeT::XprType DstXprType;
typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
public:
typedef DstEvaluatorTypeT DstEvaluatorType;
typedef SrcEvaluatorTypeT SrcEvaluatorType;
typedef typename DstEvaluatorType::Scalar Scalar;
typedef typename DstEvaluatorType::Index Index;
typedef copy_using_evaluator_traits<DstXprType, SrcXprType> AssignmentTraits;
generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
: m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
{}
Index size() const { return m_dstExpr.size(); }
Index innerSize() const { return m_dstExpr.innerSize(); }
Index outerSize() const { return m_dstExpr.outerSize(); }
Index outerStride() const { return m_dstExpr.outerStride(); }
// TODO get rid of this one:
DstXprType& dstExpression() const { return m_dstExpr; }
DstEvaluatorType& dstEvaluator() { return m_dst; }
const SrcEvaluatorType& srcEvaluator() const { return m_src; }
void assignCoeff(Index row, Index col)
{
m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
}
void assignCoeff(Index index)
{
m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
}
void assignCoeffByOuterInner(Index outer, Index inner)
{
Index row = rowIndexByOuterInner(outer, inner);
Index col = colIndexByOuterInner(outer, inner);
assignCoeff(row, col);
}
template<int StoreMode, int LoadMode>
void assignPacket(Index row, Index col)
{
m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode>(row,col));
}
template<int StoreMode, int LoadMode>
void assignPacket(Index index)
{
m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode>(index));
}
template<int StoreMode, int LoadMode>
void assignPacketByOuterInner(Index outer, Index inner)
{
Index row = rowIndexByOuterInner(outer, inner);
Index col = colIndexByOuterInner(outer, inner);
assignPacket<StoreMode,LoadMode>(row, col);
}
static Index rowIndexByOuterInner(Index outer, Index inner)
{
typedef typename DstEvaluatorType::ExpressionTraits Traits;
return int(Traits::RowsAtCompileTime) == 1 ? 0
: int(Traits::ColsAtCompileTime) == 1 ? inner
: int(Traits::Flags)&RowMajorBit ? outer
: inner;
}
static Index colIndexByOuterInner(Index outer, Index inner)
{
typedef typename DstEvaluatorType::ExpressionTraits Traits;
return int(Traits::ColsAtCompileTime) == 1 ? 0
: int(Traits::RowsAtCompileTime) == 1 ? inner
: int(Traits::Flags)&RowMajorBit ? inner
: outer;
}
protected:
DstEvaluatorType& m_dst;
const SrcEvaluatorType& m_src;
const Functor &m_functor;
// TODO find a way to avoid the need for the original expression
DstXprType& m_dstExpr;
};
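The outer/inner-to-row/col mapping above, restated for the general (non-vector) case as a tiny sketch (illustrative, not Eigen API; the real code also special-cases compile-time vectors): outer walks the major direction of the destination's storage order and inner the minor one.

struct IndexPair { long row, col; };

IndexPair byOuterInner(long outer, long inner, bool rowMajor)
{
  // row-major: outer indexes rows; column-major: outer indexes columns
  return rowMajor ? IndexPair{outer, inner} : IndexPair{inner, outer};
}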
template<typename DstXprType, typename SrcXprType, typename Functor>
void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
{
#ifdef EIGEN_DEBUG_ASSIGN
// TODO these traits should be computed from information provided by the evaluators
internal::copy_using_evaluator_traits<DstXprType, SrcXprType>::debug();
#endif
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
typedef typename evaluator<DstXprType>::type DstEvaluatorType;
typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
DstEvaluatorType dstEvaluator(dst);
SrcEvaluatorType srcEvaluator(src);
typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
dense_assignment_loop<Kernel>::run(kernel);
}
template<typename DstXprType, typename SrcXprType>
void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
{
call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
}
/***************************************************************************
* Part 5 : Entry points
***************************************************************************/
// Based on DenseBase::lazyAssign()
// The following functions are just for testing; they are meant to be moved to operator= and the like.
template<typename DstXprType, template <typename> class StorageBase, typename SrcXprType>
EIGEN_STRONG_INLINE
const DstXprType& copy_using_evaluator(const NoAlias<DstXprType, StorageBase>& dst,
const EigenBase<SrcXprType>& src)
{
return noalias_copy_using_evaluator(dst.expression(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
}
template<typename XprType, int AssumeAliasing = evaluator_traits<XprType>::AssumeAliasing>
struct AddEvalIfAssumingAliasing;
template<typename XprType>
struct AddEvalIfAssumingAliasing<XprType, 0>
{
static const XprType& run(const XprType& xpr)
{
return xpr;
}
};
template<typename XprType>
struct AddEvalIfAssumingAliasing<XprType, 1>
{
static const EvalToTemp<XprType> run(const XprType& xpr)
{
return EvalToTemp<XprType>(xpr);
}
};
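This 0/1 dispatch is what shields user code from aliasing. For example (standard Eigen usage, shown for context): matrix products assume aliasing, so the right-hand side is first evaluated into a temporary.

#include <Eigen/Dense>

int main()
{
  Eigen::Matrix2d a;
  a << 1, 2,
       3, 4;
  a = a * a;             // safe: the product assumes aliasing, so the rhs is
                         // evaluated into a temporary before being copied into a
  // a.noalias() = a * a; // would skip the temporary -- wrong when dst aliases the rhs
}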
template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_STRONG_INLINE
const DstXprType& copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
{
return noalias_copy_using_evaluator(dst.const_cast_derived(),
AddEvalIfAssumingAliasing<SrcXprType>::run(src.derived()),
func
);
}
// this mimics operator=
template<typename DstXprType, typename SrcXprType>
EIGEN_STRONG_INLINE
const DstXprType& copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src)
{
return copy_using_evaluator(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
}
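A minimal sketch of how these test entry points are exercised (internal API, mirroring Eigen's evaluator unit tests; this only compiles against a tree where this header is present):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd a(2, 2), b(2, 2);
  b.setRandom();
  Eigen::internal::copy_using_evaluator(a, b.transpose() + b); // a = b.transpose() + b
}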
template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_STRONG_INLINE
const DstXprType& noalias_copy_using_evaluator(const PlainObjectBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
{
#ifdef EIGEN_DEBUG_ASSIGN
internal::copy_using_evaluator_traits<DstXprType, SrcXprType>::debug();
#endif
#ifdef EIGEN_NO_AUTOMATIC_RESIZING
eigen_assert((dst.size()==0 || (IsVectorAtCompileTime ? (dst.size() == src.size())
: (dst.rows() == src.rows() && dst.cols() == src.cols())))
&& "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
#else
dst.const_cast_derived().resizeLike(src.derived());
#endif
call_dense_assignment_loop(dst.const_cast_derived(), src.derived(), func);
return dst.derived();
}
template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_STRONG_INLINE
const DstXprType& noalias_copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
{
call_dense_assignment_loop(dst.const_cast_derived(), src.derived(), func);
return dst.derived();
}
// Based on DenseBase::swap()
// TODO: Check whether we need to do something special for swapping two
// Arrays or Matrices. (Jitse)
// Overload default assignPacket behavior for swapping them
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
class swap_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar> >
{
typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar> > Base;
typedef typename DstEvaluatorTypeT::PacketScalar PacketScalar;
using Base::m_dst;
using Base::m_src;
using Base::m_functor;
public:
typedef typename Base::Scalar Scalar;
typedef typename Base::Index Index;
typedef typename Base::DstXprType DstXprType;
swap_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, DstXprType& dstExpr)
: Base(dst, src, swap_assign_op<Scalar>(), dstExpr)
{}
template<int StoreMode, int LoadMode>
void assignPacket(Index row, Index col)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
}
template<int StoreMode, int LoadMode>
void assignPacket(Index index)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
}
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel; by simple I mean no CRTP (Gael)
template<int StoreMode, int LoadMode>
void assignPacketByOuterInner(Index outer, Index inner)
{
Index row = Base::rowIndexByOuterInner(outer, inner);
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacket<StoreMode,LoadMode>(row, col);
}
};
template<typename DstXprType, typename SrcXprType>
void swap_using_evaluator(const DstXprType& dst, const SrcXprType& src)
{
// TODO there is too much redundancy with call_dense_assignment_loop
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
typedef typename evaluator<DstXprType>::type DstEvaluatorType;
typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
DstEvaluatorType dstEvaluator(dst);
SrcEvaluatorType srcEvaluator(src);
typedef swap_kernel<DstEvaluatorType,SrcEvaluatorType> Kernel;
Kernel kernel(dstEvaluator, srcEvaluator, dst.const_cast_derived());
dense_assignment_loop<Kernel>::run(kernel);
}
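Usage sketch for the swap path (again internal, test-only, and assuming a tree where this header is present):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXf a = Eigen::MatrixXf::Zero(2, 2);
  Eigen::MatrixXf b = Eigen::MatrixXf::Ones(2, 2);
  Eigen::internal::swap_using_evaluator(a, b); // afterwards a is all ones, b all zeros
}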
// Based on MatrixBase::operator+= (in CwiseBinaryOp.h)
template<typename DstXprType, typename SrcXprType>
void add_assign_using_evaluator(const MatrixBase<DstXprType>& dst, const MatrixBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), add_assign_op<Scalar>());
}
// Based on ArrayBase::operator+=
template<typename DstXprType, typename SrcXprType>
void add_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), add_assign_op<Scalar>());
}
// TODO: Add add_assign_using_evaluator for EigenBase? (Jitse)
template<typename DstXprType, typename SrcXprType>
void subtract_assign_using_evaluator(const MatrixBase<DstXprType>& dst, const MatrixBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), sub_assign_op<Scalar>());
}
template<typename DstXprType, typename SrcXprType>
void subtract_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), sub_assign_op<Scalar>());
}
template<typename DstXprType, typename SrcXprType>
void multiply_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), mul_assign_op<Scalar>());
}
template<typename DstXprType, typename SrcXprType>
void divide_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), div_assign_op<Scalar>());
}
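And a sketch of the compound-assignment helpers, each of which routes the corresponding operator through copy_using_evaluator with the matching functor (same caveats as above):

#include <Eigen/Dense>

int main()
{
  Eigen::ArrayXXd x = Eigen::ArrayXXd::Constant(2, 2, 3.0);
  Eigen::ArrayXXd y = Eigen::ArrayXXd::Constant(2, 2, 2.0);
  Eigen::internal::add_assign_using_evaluator(x, y);      // x += y : every coefficient becomes 5
  Eigen::internal::multiply_assign_using_evaluator(x, y); // x *= y : every coefficient becomes 10
}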
} // namespace internal
} // end namespace Eigen
#endif // EIGEN_ASSIGN_EVALUATOR_H


@@ -202,7 +202,6 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan)
//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln)


@@ -21,9 +21,6 @@ namespace Eigen {
* \param XprType the type of the expression in which we are taking a block
* \param BlockRows the number of rows of the block we are taking at compile time (optional)
* \param BlockCols the number of columns of the block we are taking at compile time (optional)
* \param InnerPanel is true if the block maps to a set of rows of a row-major matrix or
* to a set of columns of a column-major matrix (optional). This parameter makes it possible
* to determine at compile time whether aligned access is possible on the block expression.
*
* This class represents an expression of either a fixed-size or dynamic-size block. It is the return
* type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
@@ -69,8 +66,9 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
: ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
: int(traits<XprType>::MaxColsAtCompileTime),
XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
IsDense = is_same<StorageKind,Dense>::value,
IsRowMajor = (IsDense&&MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
: (IsDense&&MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
: XprTypeIsRowMajor,
HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
@@ -83,7 +81,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
&& (InnerStrideAtCompileTime == 1)
? PacketAccessBit : 0,
MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
@@ -114,7 +112,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
/** Column or Row constructor
*/
EIGEN_DEVICE_FUNC
inline Block(XprType& xpr, Index i) : Impl(xpr,i)
{
eigen_assert( (i>=0) && (
@@ -124,7 +121,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
/** Fixed-size constructor
*/
EIGEN_DEVICE_FUNC
inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
: Impl(xpr, a_startRow, a_startCol)
{
@@ -135,7 +131,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
/** Dynamic-size constructor
*/
EIGEN_DEVICE_FUNC
inline Block(XprType& xpr,
Index a_startRow, Index a_startCol,
Index blockRows, Index blockCols)
@@ -159,9 +154,8 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
public:
typedef Impl Base;
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
EIGEN_DEVICE_FUNC
inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
: Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
};
@@ -183,7 +177,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
/** Column or Row constructor
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr, Index i)
: m_xpr(xpr),
// It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
@@ -198,7 +191,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
/** Fixed-size constructor
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
: m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
m_blockRows(BlockRows), m_blockCols(BlockCols)
@@ -206,7 +198,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
/** Dynamic-size constructor
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr,
Index a_startRow, Index a_startCol,
Index blockRows, Index blockCols)
@@ -214,10 +205,9 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
m_blockRows(blockRows), m_blockCols(blockCols)
{}
EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); }
inline Index rows() const { return m_blockRows.value(); }
inline Index cols() const { return m_blockCols.value(); }
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -225,20 +215,17 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
return m_xpr.derived()
.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
{
return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -247,7 +234,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return m_xpr.const_cast_derived()
@@ -255,7 +241,6 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
EIGEN_DEVICE_FUNC
inline const CoeffReturnType coeff(Index index) const
{
return m_xpr
@@ -295,24 +280,21 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
#ifdef EIGEN_PARSED_BY_DOXYGEN
/** \sa MapBase::data() */
EIGEN_DEVICE_FUNC inline const Scalar* data() const;
EIGEN_DEVICE_FUNC inline Index innerStride() const;
EIGEN_DEVICE_FUNC inline Index outerStride() const;
inline const Scalar* data() const;
inline Index innerStride() const;
inline Index outerStride() const;
#endif
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
{
return m_xpr;
}
EIGEN_DEVICE_FUNC
Index startRow() const
Index startRow() const
{
return m_startRow.value();
}
EIGEN_DEVICE_FUNC
Index startCol() const
{
return m_startCol.value();
@@ -341,7 +323,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
/** Column or Row constructor
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr, Index i)
: Base(internal::const_cast_ptr(&xpr.coeffRef(
(BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
@@ -355,7 +336,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
/** Fixed-size constructor
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
: Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
{
@@ -364,7 +344,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
/** Dynamic-size constructor
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr,
Index startRow, Index startCol,
Index blockRows, Index blockCols)
@@ -374,14 +353,12 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
init();
}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
{
return m_xpr;
}
/** \sa MapBase::innerStride() */
EIGEN_DEVICE_FUNC
inline Index innerStride() const
{
return internal::traits<BlockType>::HasSameStorageOrderAsXprType
@@ -390,7 +367,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
}
/** \sa MapBase::outerStride() */
EIGEN_DEVICE_FUNC
inline Index outerStride() const
{
return m_outerStride;
@@ -404,7 +380,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal used by allowAligned() */
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
: Base(data, blockRows, blockCols), m_xpr(xpr)
{
@@ -413,7 +388,6 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
#endif
protected:
EIGEN_DEVICE_FUNC
void init()
{
m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType


@@ -8,4 +8,3 @@ INSTALL(FILES
ADD_SUBDIRECTORY(products)
ADD_SUBDIRECTORY(util)
ADD_SUBDIRECTORY(arch)
ADD_SUBDIRECTORY(functors)


@@ -30,7 +30,6 @@ struct CommaInitializer
typedef typename XprType::Scalar Scalar;
typedef typename XprType::Index Index;
EIGEN_DEVICE_FUNC
inline CommaInitializer(XprType& xpr, const Scalar& s)
: m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
{
@@ -38,7 +37,6 @@ struct CommaInitializer
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
: m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
{
@@ -48,7 +46,6 @@ struct CommaInitializer
/* Copy/Move constructor which transfers ownership. This is crucial in the
* absence of return value optimization to avoid assertions during destruction. */
// FIXME in C++11 mode this could be replaced by a proper RValue constructor
EIGEN_DEVICE_FUNC
inline CommaInitializer(const CommaInitializer& o)
: m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
// Mark original object as finished. In absence of R-value references we need to const_cast:
@@ -58,7 +55,6 @@ struct CommaInitializer
}
/* inserts a scalar value in the target matrix */
EIGEN_DEVICE_FUNC
CommaInitializer& operator,(const Scalar& s)
{
if (m_col==m_xpr.cols())
@@ -78,7 +74,6 @@ struct CommaInitializer
/* inserts a matrix expression in the target matrix */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
{
if(other.cols()==0 || other.rows()==0)
@@ -104,7 +99,6 @@ struct CommaInitializer
return *this;
}
EIGEN_DEVICE_FUNC
inline ~CommaInitializer()
{
eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
@@ -119,10 +113,9 @@ struct CommaInitializer
* quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
* \endcode
*/
EIGEN_DEVICE_FUNC
inline XprType& finished() { return m_xpr; }
XprType& m_xpr; // target expression
XprType& m_xpr; // target expression
Index m_row; // current row id
Index m_col; // current col id
Index m_currentBlockRows; // current block height

File diff suppressed because it is too large


@@ -81,7 +81,8 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
)
),
Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + functor_traits<BinaryOp>::Cost
Cost0 = EIGEN_ADD_COST(LhsCoeffReadCost,RhsCoeffReadCost),
CoeffReadCost = EIGEN_ADD_COST(Cost0,functor_traits<BinaryOp>::Cost)
};
};
} // end namespace internal
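The switch from a plain '+' to EIGEN_ADD_COST in this hunk is the point of the backported cost-computation fix: Dynamic must be absorbing, otherwise Dynamic (internally -1) silently combines with a finite cost into a meaningless finite value. A sketch of the idea (illustrative restatement, not the verbatim Eigen macro):

#define SKETCH_ADD_COST(a, b) \
  ((int(a) == Eigen::Dynamic || int(b) == Eigen::Dynamic) ? int(Eigen::Dynamic) \
                                                          : int(a) + int(b))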
@@ -122,7 +123,6 @@ class CwiseBinaryOp : internal::no_assignment_operator,
typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
: m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
{
@@ -132,7 +132,6 @@ class CwiseBinaryOp : internal::no_assignment_operator,
eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rows() const {
// return the fixed size type if available to enable compile time optimizations
if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
@@ -140,7 +139,6 @@ class CwiseBinaryOp : internal::no_assignment_operator,
else
return m_lhs.rows();
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index cols() const {
// return the fixed size type if available to enable compile time optimizations
if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
@@ -150,13 +148,10 @@ class CwiseBinaryOp : internal::no_assignment_operator,
}
/** \returns the left hand side nested expression */
EIGEN_DEVICE_FUNC
const _LhsNested& lhs() const { return m_lhs; }
/** \returns the right hand side nested expression */
EIGEN_DEVICE_FUNC
const _RhsNested& rhs() const { return m_rhs; }
/** \returns the functor representing the binary operation */
EIGEN_DEVICE_FUNC
const BinaryOp& functor() const { return m_functor; }
protected:
@@ -175,7 +170,6 @@ class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE( Derived )
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
{
return derived().functor()(derived().lhs().coeff(rowId, colId),
@@ -189,7 +183,6 @@ class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
derived().rhs().template packet<LoadMode>(rowId, colId));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
return derived().functor()(derived().lhs().coeff(index),
@@ -235,4 +228,3 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
} // end namespace Eigen
#endif // EIGEN_CWISE_BINARY_OP_H


@@ -54,7 +54,6 @@ class CwiseNullaryOp : internal::no_assignment_operator,
typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
EIGEN_DEVICE_FUNC
CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
: m_rows(nbRows), m_cols(nbCols), m_functor(func)
{
@@ -64,12 +63,9 @@ class CwiseNullaryOp : internal::no_assignment_operator,
&& (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
{
return m_functor(rowId, colId);
@@ -81,7 +77,6 @@ class CwiseNullaryOp : internal::no_assignment_operator,
return m_functor.packetOp(rowId, colId);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
return m_functor(index);
@@ -94,7 +89,6 @@ class CwiseNullaryOp : internal::no_assignment_operator,
}
/** \returns the functor representing the nullary operation */
EIGEN_DEVICE_FUNC
const NullaryOp& functor() const { return m_functor; }
protected:
@@ -138,9 +132,6 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
*
* The template parameter \a CustomNullaryOp is the type of the functor.
*
* Here is an example with C++11 random generators: \include random_cpp11.cpp
* Output: \verbinclude random_cpp11.out
*
* \sa class CwiseNullaryOp
*/
template<typename Derived>
@@ -749,7 +740,6 @@ namespace internal {
template<typename Derived, bool Big = (Derived::SizeAtCompileTime>=16)>
struct setIdentity_impl
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& m)
{
return m = Derived::Identity(m.rows(), m.cols());
@@ -760,7 +750,6 @@ template<typename Derived>
struct setIdentity_impl<Derived, true>
{
typedef typename Derived::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& m)
{
m.setZero();


@@ -47,7 +47,7 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
Flags = _XprTypeNested::Flags & (
HereditaryBits | LinearAccessBit | AlignedBit
| (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
CoeffReadCost = _XprTypeNested::CoeffReadCost + functor_traits<UnaryOp>::Cost
CoeffReadCost = EIGEN_ADD_COST(_XprTypeNested::CoeffReadCost, functor_traits<UnaryOp>::Cost)
};
};
}
@@ -64,26 +64,20 @@ class CwiseUnaryOp : internal::no_assignment_operator,
typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
EIGEN_DEVICE_FUNC
inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
: m_xpr(xpr), m_functor(func) {}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
/** \returns the functor representing the unary operation */
EIGEN_DEVICE_FUNC
const UnaryOp& functor() const { return m_functor; }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
nestedExpression() const { return m_xpr; }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
typename internal::remove_all<typename XprType::Nested>::type&
nestedExpression() { return m_xpr.const_cast_derived(); }
@@ -104,7 +98,6 @@ class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
{
return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
@@ -116,14 +109,12 @@ class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
return derived().functor()(derived().nestedExpression().coeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
{
return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));


@@ -40,15 +40,14 @@ static inline void check_DenseIndex_is_signed() {
*/
template<typename Derived> class DenseBase
#ifndef EIGEN_PARSED_BY_DOXYGEN
: public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>
: public internal::special_scalar_op_base<Derived, typename internal::traits<Derived>::Scalar,
typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
DenseCoeffsBase<Derived> >
#else
: public DenseCoeffsBase<Derived>
#endif // not EIGEN_PARSED_BY_DOXYGEN
{
public:
using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
class InnerIterator;
@@ -63,8 +62,9 @@ template<typename Derived> class DenseBase
typedef typename internal::traits<Derived>::Scalar Scalar;
typedef typename internal::packet_traits<Scalar>::type PacketScalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef internal::special_scalar_op_base<Derived,Scalar,RealScalar, DenseCoeffsBase<Derived> > Base;
typedef DenseCoeffsBase<Derived> Base;
using Base::operator*;
using Base::derived;
using Base::const_cast_derived;
using Base::rows;
@@ -182,19 +182,13 @@ template<typename Derived> class DenseBase
/** \returns the number of nonzero coefficients which is in practice the number
* of stored coefficients. */
EIGEN_DEVICE_FUNC
inline Index nonZeros() const { return size(); }
/** \returns true if either the number of rows or the number of columns is equal to 1.
* In other words, this function returns
* \code rows()==1 || cols()==1 \endcode
* \sa rows(), cols(), IsVectorAtCompileTime. */
/** \returns the outer size.
*
* \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
* with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
* column-major matrix, and the number of rows for a row-major matrix. */
EIGEN_DEVICE_FUNC
Index outerSize() const
{
return IsVectorAtCompileTime ? 1
@@ -206,7 +200,6 @@ template<typename Derived> class DenseBase
* \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
* with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
* column-major matrix, and the number of columns for a row-major matrix. */
EIGEN_DEVICE_FUNC
Index innerSize() const
{
return IsVectorAtCompileTime ? this->size()
@@ -217,7 +210,6 @@ template<typename Derived> class DenseBase
* Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
* nothing else.
*/
EIGEN_DEVICE_FUNC
void resize(Index newSize)
{
EIGEN_ONLY_USED_FOR_DEBUG(newSize);
@@ -228,7 +220,6 @@ template<typename Derived> class DenseBase
* Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
* nothing else.
*/
EIGEN_DEVICE_FUNC
void resize(Index nbRows, Index nbCols)
{
EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
@@ -252,54 +243,44 @@ template<typename Derived> class DenseBase
/** Copies \a other into *this. \returns a reference to *this. */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator=(const DenseBase<OtherDerived>& other);
/** Special case of the template operator=, in order to prevent the compiler
* from generating a default operator= (issue hit with g++ 4.1)
*/
EIGEN_DEVICE_FUNC
Derived& operator=(const DenseBase& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator=(const EigenBase<OtherDerived> &other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator+=(const EigenBase<OtherDerived> &other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator-=(const EigenBase<OtherDerived> &other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& func);
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** Copies \a other into *this without evaluating other. \returns a reference to *this. */
/** \internal Copies \a other into *this without evaluating other. \returns a reference to *this. */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& lazyAssign(const DenseBase<OtherDerived>& other);
#endif // not EIGEN_PARSED_BY_DOXYGEN
EIGEN_DEVICE_FUNC
/** \internal Evaluates \a other into *this. \returns a reference to *this. */
template<typename OtherDerived>
Derived& lazyAssign(const ReturnByValue<OtherDerived>& other);
CommaInitializer<Derived> operator<< (const Scalar& s);
template<unsigned int Added,unsigned int Removed>
const Flagged<Derived, Added, Removed> flagged() const;
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);
EIGEN_DEVICE_FUNC
Eigen::Transpose<Derived> transpose();
typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
EIGEN_DEVICE_FUNC
typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
ConstTransposeReturnType transpose() const;
EIGEN_DEVICE_FUNC
void transposeInPlace();
#ifndef EIGEN_NO_DEBUG
protected:
@@ -309,68 +290,65 @@ template<typename Derived> class DenseBase
#endif
EIGEN_DEVICE_FUNC static const ConstantReturnType
static const ConstantReturnType
Constant(Index rows, Index cols, const Scalar& value);
EIGEN_DEVICE_FUNC static const ConstantReturnType
static const ConstantReturnType
Constant(Index size, const Scalar& value);
EIGEN_DEVICE_FUNC static const ConstantReturnType
static const ConstantReturnType
Constant(const Scalar& value);
EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
static const SequentialLinSpacedReturnType
LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
static const RandomAccessLinSpacedReturnType
LinSpaced(Index size, const Scalar& low, const Scalar& high);
EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
static const SequentialLinSpacedReturnType
LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
- EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+ static const RandomAccessLinSpacedReturnType
  LinSpaced(const Scalar& low, const Scalar& high);
- template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+ template<typename CustomNullaryOp>
  static const CwiseNullaryOp<CustomNullaryOp, Derived>
  NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
- template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+ template<typename CustomNullaryOp>
  static const CwiseNullaryOp<CustomNullaryOp, Derived>
  NullaryExpr(Index size, const CustomNullaryOp& func);
- template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+ template<typename CustomNullaryOp>
  static const CwiseNullaryOp<CustomNullaryOp, Derived>
  NullaryExpr(const CustomNullaryOp& func);
- EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
- EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index size);
- EIGEN_DEVICE_FUNC static const ConstantReturnType Zero();
- EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols);
- EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size);
- EIGEN_DEVICE_FUNC static const ConstantReturnType Ones();
+ static const ConstantReturnType Zero(Index rows, Index cols);
+ static const ConstantReturnType Zero(Index size);
+ static const ConstantReturnType Zero();
+ static const ConstantReturnType Ones(Index rows, Index cols);
+ static const ConstantReturnType Ones(Index size);
+ static const ConstantReturnType Ones();
- EIGEN_DEVICE_FUNC void fill(const Scalar& value);
- EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value);
- EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
- EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high);
- EIGEN_DEVICE_FUNC Derived& setZero();
- EIGEN_DEVICE_FUNC Derived& setOnes();
- EIGEN_DEVICE_FUNC Derived& setRandom();
+ void fill(const Scalar& value);
+ Derived& setConstant(const Scalar& value);
+ Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
+ Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+ Derived& setZero();
+ Derived& setOnes();
+ Derived& setRandom();
- template<typename OtherDerived> EIGEN_DEVICE_FUNC
+ template<typename OtherDerived>
  bool isApprox(const DenseBase<OtherDerived>& other,
                const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
- EIGEN_DEVICE_FUNC
  bool isMuchSmallerThan(const RealScalar& other,
                         const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
- template<typename OtherDerived> EIGEN_DEVICE_FUNC
+ template<typename OtherDerived>
  bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
                         const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
- EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
- EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
- EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
- EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+ bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+ bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+ bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+ bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
  inline bool hasNaN() const;
  inline bool allFinite() const;
- EIGEN_DEVICE_FUNC
  inline Derived& operator*=(const Scalar& other);
- EIGEN_DEVICE_FUNC
  inline Derived& operator/=(const Scalar& other);
typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
@@ -379,7 +357,6 @@ template<typename Derived> class DenseBase
* Notice that in the case of a plain matrix or vector (not an expression) this function just returns
* a const reference, in order to avoid a useless copy.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE EvalReturnType eval() const
{
// Even though MSVC does not honor strong inlining when the return type
@@ -392,7 +369,6 @@ template<typename Derived> class DenseBase
*
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(const DenseBase<OtherDerived>& other,
int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
{
@@ -403,52 +379,46 @@ template<typename Derived> class DenseBase
*
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(PlainObjectBase<OtherDerived>& other)
{
SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
}
- EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
- EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
- EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
- template<bool Enable> EIGEN_DEVICE_FUNC
- inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
- template<bool Enable> EIGEN_DEVICE_FUNC
- inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+ inline const NestByValue<Derived> nestByValue() const;
+ inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+ inline ForceAlignedAccess<Derived> forceAlignedAccess();
+ template<bool Enable> inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
+ template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
- EIGEN_DEVICE_FUNC Scalar sum() const;
- EIGEN_DEVICE_FUNC Scalar mean() const;
- EIGEN_DEVICE_FUNC Scalar trace() const;
+ Scalar sum() const;
+ Scalar mean() const;
+ Scalar trace() const;
- EIGEN_DEVICE_FUNC Scalar prod() const;
+ Scalar prod() const;
- EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
- EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
+ typename internal::traits<Derived>::Scalar minCoeff() const;
+ typename internal::traits<Derived>::Scalar maxCoeff() const;
- template<typename IndexType> EIGEN_DEVICE_FUNC
+ template<typename IndexType>
  typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
- template<typename IndexType> EIGEN_DEVICE_FUNC
+ template<typename IndexType>
  typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
- template<typename IndexType> EIGEN_DEVICE_FUNC
+ template<typename IndexType>
  typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
- template<typename IndexType> EIGEN_DEVICE_FUNC
+ template<typename IndexType>
  typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
  template<typename BinaryOp>
- EIGEN_DEVICE_FUNC
  typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
  redux(const BinaryOp& func) const;
  template<typename Visitor>
- EIGEN_DEVICE_FUNC
  void visit(Visitor& func) const;
  inline const WithFormat<Derived> format(const IOFormat& fmt) const;
  /** \returns the unique coefficient of a 1x1 expression */
- EIGEN_DEVICE_FUNC
CoeffReturnType value() const
{
EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
@@ -456,8 +426,8 @@ template<typename Derived> class DenseBase
return derived().coeff(0,0);
}
- bool all() const;
- bool any() const;
+ bool all(void) const;
+ bool any(void) const;
Index count() const;
typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
@@ -490,8 +460,10 @@ template<typename Derived> class DenseBase
template<int p> RealScalar lpNorm() const;
template<int RowFactor, int ColFactor>
- const Replicate<Derived,RowFactor,ColFactor> replicate() const;
- const Replicate<Derived,Dynamic,Dynamic> replicate(Index rowFactor,Index colFactor) const;
+ inline const Replicate<Derived,RowFactor,ColFactor> replicate() const;
+ typedef Replicate<Derived,Dynamic,Dynamic> ReplicateReturnType;
+ inline const ReplicateReturnType replicate(Index rowFactor,Index colFactor) const;
typedef Reverse<Derived, BothDirections> ReverseReturnType;
typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
@@ -506,18 +478,27 @@ template<typename Derived> class DenseBase
# endif
#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
#ifdef EIGEN2_SUPPORT
Block<Derived> corner(CornerType type, Index cRows, Index cCols);
const Block<Derived> corner(CornerType type, Index cRows, Index cCols) const;
template<int CRows, int CCols>
Block<Derived, CRows, CCols> corner(CornerType type);
template<int CRows, int CCols>
const Block<Derived, CRows, CCols> corner(CornerType type) const;
#endif // EIGEN2_SUPPORT
// disable the use of evalTo for dense objects with a nice compilation error
- template<typename Dest>
- EIGEN_DEVICE_FUNC
- inline void evalTo(Dest& ) const
+ template<typename Dest> inline void evalTo(Dest& ) const
{
EIGEN_STATIC_ASSERT((internal::is_same<Dest,void>::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
}
protected:
/** Default constructor. Do nothing. */
- EIGEN_DEVICE_FUNC DenseBase()
+ DenseBase()
{
/* Just checks for self-consistency of the flags.
* Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
@@ -530,9 +511,9 @@ template<typename Derived> class DenseBase
}
private:
- EIGEN_DEVICE_FUNC explicit DenseBase(int);
- EIGEN_DEVICE_FUNC DenseBase(int,int);
- template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase<OtherDerived>&);
+ explicit DenseBase(int);
+ DenseBase(int,int);
+ template<typename OtherDerived> explicit DenseBase(const DenseBase<OtherDerived>&);
};
} // end namespace Eigen
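For orientation, a minimal usage sketch of the factory and initializer API declared above (LinSpaced, Zero, Ones, the set* methods, isApprox); it assumes nothing beyond the public Eigen headers of this era:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Factories return lightweight expressions; assignment evaluates them.
  Eigen::VectorXf v = Eigen::VectorXf::LinSpaced(5, 0.f, 1.f); // 0 0.25 0.5 0.75 1
  Eigen::MatrixXf m = Eigen::MatrixXf::Zero(2, 3);

  // The set* members initialize in place and return *this for chaining.
  m.setOnes();
  v.setLinSpaced(5, 1.f, 2.f);

  // isApprox is a fuzzy comparison scaled by the operand norms (see Fuzzy.h below).
  std::cout << (m.isApprox(Eigen::MatrixXf::Ones(2, 3)) ? "equal" : "different") << "\n";
  return 0;
}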

Eigen/src/Core/DenseCoeffsBase.h

@@ -61,7 +61,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
using Base::size;
using Base::derived;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const
{
return int(Derived::RowsAtCompileTime) == 1 ? 0
@@ -70,7 +69,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
: inner;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const
{
return int(Derived::ColsAtCompileTime) == 1 ? 0
@@ -93,7 +91,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
*
* \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
{
eigen_internal_assert(row >= 0 && row < rows()
@@ -101,7 +98,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
return derived().coeff(row, col);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
{
return coeff(rowIndexByOuterInner(outer, inner),
@@ -112,7 +108,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
*
* \sa operator()(Index,Index), operator[](Index)
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const
{
eigen_assert(row >= 0 && row < rows()
@@ -135,7 +130,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
* \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
coeff(Index index) const
{
@@ -152,12 +146,13 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
* z() const, w() const
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
operator[](Index index) const
{
#ifndef EIGEN2_SUPPORT
EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
#endif
eigen_assert(index >= 0 && index < size());
return derived().coeff(index);
}
@@ -172,7 +167,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
* z() const, w() const
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
operator()(Index index) const
{
@@ -182,25 +176,21 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
/** equivalent to operator[](0). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
x() const { return (*this)[0]; }
/** equivalent to operator[](1). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
y() const { return (*this)[1]; }
/** equivalent to operator[](2). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
z() const { return (*this)[2]; }
/** equivalent to operator[](3). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE CoeffReturnType
w() const { return (*this)[3]; }
@@ -321,7 +311,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
*
* \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
{
eigen_internal_assert(row >= 0 && row < rows()
@@ -329,7 +318,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
return derived().coeffRef(row, col);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
coeffRefByOuterInner(Index outer, Index inner)
{
@@ -342,7 +330,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
* \sa operator[](Index)
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
operator()(Index row, Index col)
{
@@ -367,7 +354,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
* \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
coeffRef(Index index)
{
@@ -382,12 +368,13 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
* \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
operator[](Index index)
{
#ifndef EIGEN2_SUPPORT
EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
#endif
eigen_assert(index >= 0 && index < size());
return derived().coeffRef(index);
}
@@ -401,7 +388,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
* \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
operator()(Index index)
{
@@ -411,25 +397,21 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
/** equivalent to operator[](0). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
x() { return (*this)[0]; }
/** equivalent to operator[](1). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
y() { return (*this)[1]; }
/** equivalent to operator[](2). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
z() { return (*this)[2]; }
/** equivalent to operator[](3). */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
w() { return (*this)[3]; }
@@ -491,7 +473,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
{
eigen_internal_assert(row >= 0 && row < rows()
@@ -508,7 +489,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
{
eigen_internal_assert(index >= 0 && index < size());
@@ -517,7 +497,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
{
const Index row = rowIndexByOuterInner(outer,inner);
@@ -602,7 +581,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
*
* \sa outerStride(), rowStride(), colStride()
*/
EIGEN_DEVICE_FUNC
inline Index innerStride() const
{
return derived().innerStride();
@@ -613,7 +591,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
*
* \sa innerStride(), rowStride(), colStride()
*/
EIGEN_DEVICE_FUNC
inline Index outerStride() const
{
return derived().outerStride();
@@ -629,7 +606,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
*
* \sa innerStride(), outerStride(), colStride()
*/
EIGEN_DEVICE_FUNC
inline Index rowStride() const
{
return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -639,7 +615,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
*
* \sa innerStride(), outerStride(), rowStride()
*/
EIGEN_DEVICE_FUNC
inline Index colStride() const
{
return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -677,7 +652,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
*
* \sa outerStride(), rowStride(), colStride()
*/
EIGEN_DEVICE_FUNC
inline Index innerStride() const
{
return derived().innerStride();
@@ -688,7 +662,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
*
* \sa innerStride(), rowStride(), colStride()
*/
EIGEN_DEVICE_FUNC
inline Index outerStride() const
{
return derived().outerStride();
@@ -704,7 +677,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
*
* \sa innerStride(), outerStride(), colStride()
*/
EIGEN_DEVICE_FUNC
inline Index rowStride() const
{
return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -714,7 +686,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
*
* \sa innerStride(), outerStride(), rowStride()
*/
EIGEN_DEVICE_FUNC
inline Index colStride() const
{
return Derived::IsRowMajor ? innerStride() : outerStride();
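To make the accessor and stride layer above concrete, a short sketch: operator() and operator[] forward to coeff()/coeffRef() after range checks, x()/y()/z()/w() are operator[](0..3), and for a default column-major matrix the inner stride is 1 while the outer stride equals the number of rows:

#include <Eigen/Dense>
#include <cassert>

int main()
{
  Eigen::Matrix3d a = Eigen::Matrix3d::Identity();
  a(0, 2) = 5.0;              // writable coeffRef path
  double x = a(0, 2);         // read-only coeff path
  assert(x == 5.0);

  Eigen::Vector4d p(1, 2, 3, 4);
  assert(p.x() == 1 && p.w() == 4);   // x()/y()/z()/w() index coefficients 0..3

  // Column-major storage: inner stride 1, outer stride == rows().
  assert(a.innerStride() == 1 && a.outerStride() == 3);
  return 0;
}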

Eigen/src/Core/DenseStorage.h

@@ -3,7 +3,7 @@
//
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
- // Copyright (C) 2010-2013 Hauke Heibel <hauke.heibel@gmail.com>
+ // Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,9 +24,7 @@ namespace internal {
struct constructor_without_unaligned_array_assert {};
- template<typename T, int Size>
- EIGEN_DEVICE_FUNC
- void check_static_allocation_size()
+ template<typename T, int Size> void check_static_allocation_size()
{
// if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
#if EIGEN_STACK_ALLOCATION_LIMIT
@@ -40,20 +38,18 @@ void check_static_allocation_size()
*/
template <typename T, int Size, int MatrixOrArrayOptions,
int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
- : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
+ : (((Size*sizeof(T))%16)==0) ? 16
: 0 >
struct plain_array
{
T array[Size];
- EIGEN_DEVICE_FUNC
- plain_array()
+ plain_array()
  {
    check_static_allocation_size<T,Size>();
  }
- EIGEN_DEVICE_FUNC
- plain_array(constructor_without_unaligned_array_assert)
+ plain_array(constructor_without_unaligned_array_assert)
  {
    check_static_allocation_size<T,Size>();
  }
@@ -68,31 +64,29 @@ struct plain_array
template<typename PtrType>
EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
- eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
+ eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
&& "this assertion is explained here: " \
"http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
" **** READ THIS WEB PAGE !!! ****");
#else
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
- eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
+ eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
&& "this assertion is explained here: " \
"http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
" **** READ THIS WEB PAGE !!! ****");
#endif
template <typename T, int Size, int MatrixOrArrayOptions>
- struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
+ struct plain_array<T, Size, MatrixOrArrayOptions, 16>
{
- EIGEN_USER_ALIGN_DEFAULT T array[Size];
+ EIGEN_USER_ALIGN16 T array[Size];
EIGEN_DEVICE_FUNC
plain_array()
{
- EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
+ EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
check_static_allocation_size<T,Size>();
}
EIGEN_DEVICE_FUNC
plain_array(constructor_without_unaligned_array_assert)
{
check_static_allocation_size<T,Size>();
@@ -102,9 +96,9 @@ struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
template <typename T, int MatrixOrArrayOptions, int Alignment>
struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
{
- EIGEN_USER_ALIGN_DEFAULT T array[1];
- EIGEN_DEVICE_FUNC plain_array() {}
- EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
+ EIGEN_USER_ALIGN16 T array[1];
+ plain_array() {}
+ plain_array(constructor_without_unaligned_array_assert) {}
};
} // end namespace internal
@@ -128,44 +122,41 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
{
internal::plain_array<T,Size,_Options> m_data;
public:
- EIGEN_DEVICE_FUNC DenseStorage() {}
- EIGEN_DEVICE_FUNC
+ DenseStorage() {}
  DenseStorage(internal::constructor_without_unaligned_array_assert)
    : m_data(internal::constructor_without_unaligned_array_assert()) {}
- EIGEN_DEVICE_FUNC
  DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
- EIGEN_DEVICE_FUNC
  DenseStorage& operator=(const DenseStorage& other)
  {
    if (this != &other) m_data = other.m_data;
    return *this;
  }
- EIGEN_DEVICE_FUNC DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
- EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
- EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
- EIGEN_DEVICE_FUNC void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
- EIGEN_DEVICE_FUNC void resize(DenseIndex,DenseIndex,DenseIndex) {}
- EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
- EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+ DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+ void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+ static DenseIndex rows(void) {return _Rows;}
+ static DenseIndex cols(void) {return _Cols;}
+ void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
+ void resize(DenseIndex,DenseIndex,DenseIndex) {}
+ const T *data() const { return m_data.array; }
+ T *data() { return m_data.array; }
};
// null matrix
template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
{
public:
- EIGEN_DEVICE_FUNC DenseStorage() {}
- EIGEN_DEVICE_FUNC DenseStorage(internal::constructor_without_unaligned_array_assert) {}
- EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
- EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
- EIGEN_DEVICE_FUNC DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
- EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
- EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
- EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
- EIGEN_DEVICE_FUNC void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
- EIGEN_DEVICE_FUNC void resize(DenseIndex,DenseIndex,DenseIndex) {}
- EIGEN_DEVICE_FUNC const T *data() const { return 0; }
- EIGEN_DEVICE_FUNC T *data() { return 0; }
+ DenseStorage() {}
+ DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+ DenseStorage(const DenseStorage&) {}
+ DenseStorage& operator=(const DenseStorage&) { return *this; }
+ DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
+ void swap(DenseStorage& ) {}
+ static DenseIndex rows(void) {return _Rows;}
+ static DenseIndex cols(void) {return _Cols;}
+ void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
+ void resize(DenseIndex,DenseIndex,DenseIndex) {}
+ const T *data() const { return 0; }
+ T *data() { return 0; }
};
// more specializations for null matrices; these are necessary to resolve ambiguities
@@ -185,29 +176,29 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
DenseIndex m_rows;
DenseIndex m_cols;
public:
- EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
+ DenseStorage() : m_rows(0), m_cols(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
DenseStorage& operator=(const DenseStorage& other)
{
  if (this != &other)
  {
    m_data = other.m_data;
    m_rows = other.m_rows;
    m_cols = other.m_cols;
  }
  return *this;
}
DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
- EIGEN_DEVICE_FUNC DenseIndex rows() const {return m_rows;}
- EIGEN_DEVICE_FUNC DenseIndex cols() const {return m_cols;}
+ DenseIndex rows() const {return m_rows;}
+ DenseIndex cols() const {return m_cols;}
  void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
  void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
- EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
- EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+ const T *data() const { return m_data.array; }
+ T *data() { return m_data.array; }
};
// dynamic-size matrix with fixed-size storage and fixed width
@@ -216,27 +207,27 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
internal::plain_array<T,Size,_Options> m_data;
DenseIndex m_rows;
public:
- EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
+ DenseStorage() : m_rows(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
DenseStorage& operator=(const DenseStorage& other)
{
  if (this != &other)
  {
    m_data = other.m_data;
    m_rows = other.m_rows;
  }
  return *this;
}
DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
- EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
- EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return _Cols;}
+ DenseIndex rows(void) const {return m_rows;}
+ DenseIndex cols(void) const {return _Cols;}
void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
- EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
- EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+ const T *data() const { return m_data.array; }
+ T *data() { return m_data.array; }
};
// dynamic-size matrix with fixed-size storage and fixed height
@@ -245,7 +236,7 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
internal::plain_array<T,Size,_Options> m_data;
DenseIndex m_cols;
public:
- EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
+ DenseStorage() : m_cols(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
@@ -260,12 +251,12 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
}
DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
- EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return _Rows;}
- EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
+ DenseIndex rows(void) const {return _Rows;}
+ DenseIndex cols(void) const {return m_cols;}
  void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
  void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
- EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
- EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+ const T *data() const { return m_data.array; }
+ T *data() { return m_data.array; }
};
// purely dynamic matrix.
@@ -275,28 +266,12 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
DenseIndex m_rows;
DenseIndex m_cols;
public:
- EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
+ DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(0), m_rows(0), m_cols(0) {}
DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
{ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
DenseStorage(const DenseStorage& other)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols))
, m_rows(other.m_rows)
, m_cols(other.m_cols)
{
internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
}
DenseStorage& operator=(const DenseStorage& other)
{
if (this != &other)
{
DenseStorage tmp(other);
this->swap(tmp);
}
return *this;
}
#ifdef EIGEN_HAVE_RVALUE_REFERENCES
DenseStorage(DenseStorage&& other)
: m_data(std::move(other.m_data))
@@ -317,8 +292,8 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
- EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
- EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
+ DenseIndex rows(void) const {return m_rows;}
+ DenseIndex cols(void) const {return m_cols;}
void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
{
m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
@@ -339,8 +314,11 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
m_rows = nbRows;
m_cols = nbCols;
}
- EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
- EIGEN_DEVICE_FUNC T *data() { return m_data; }
+ const T *data() const { return m_data; }
+ T *data() { return m_data; }
private:
DenseStorage(const DenseStorage&);
DenseStorage& operator=(const DenseStorage&);
};
// matrix with dynamic width and fixed height (so that matrix has dynamic size).
@@ -349,25 +327,10 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
T *m_data;
DenseIndex m_cols;
public:
- EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
+ DenseStorage() : m_data(0), m_cols(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
{ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
DenseStorage(const DenseStorage& other)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
, m_cols(other.m_cols)
{
internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
}
DenseStorage& operator=(const DenseStorage& other)
{
if (this != &other)
{
DenseStorage tmp(other);
this->swap(tmp);
}
return *this;
}
#ifdef EIGEN_HAVE_RVALUE_REFERENCES
DenseStorage(DenseStorage&& other)
: m_data(std::move(other.m_data))
@@ -385,8 +348,8 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
#endif
~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
- EIGEN_DEVICE_FUNC static DenseIndex rows(void) {return _Rows;}
- EIGEN_DEVICE_FUNC DenseIndex cols(void) const {return m_cols;}
+ static DenseIndex rows(void) {return _Rows;}
+ DenseIndex cols(void) const {return m_cols;}
void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
{
m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
@@ -405,8 +368,11 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
}
m_cols = nbCols;
}
- EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
- EIGEN_DEVICE_FUNC T *data() { return m_data; }
+ const T *data() const { return m_data; }
+ T *data() { return m_data; }
private:
DenseStorage(const DenseStorage&);
DenseStorage& operator=(const DenseStorage&);
};
// matrix with dynamic height and fixed width (so that matrix has dynamic size).
@@ -415,25 +381,10 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
T *m_data;
DenseIndex m_rows;
public:
- EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
+ DenseStorage() : m_data(0), m_rows(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
{ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
DenseStorage(const DenseStorage& other)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
, m_rows(other.m_rows)
{
internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
}
DenseStorage& operator=(const DenseStorage& other)
{
if (this != &other)
{
DenseStorage tmp(other);
this->swap(tmp);
}
return *this;
}
#ifdef EIGEN_HAVE_RVALUE_REFERENCES
DenseStorage(DenseStorage&& other)
: m_data(std::move(other.m_data))
@@ -451,8 +402,8 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
#endif
~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
- EIGEN_DEVICE_FUNC DenseIndex rows(void) const {return m_rows;}
- EIGEN_DEVICE_FUNC static DenseIndex cols(void) {return _Cols;}
+ DenseIndex rows(void) const {return m_rows;}
+ static DenseIndex cols(void) {return _Cols;}
void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
{
m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
@@ -471,8 +422,11 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
}
m_rows = nbRows;
}
- EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
- EIGEN_DEVICE_FUNC T *data() { return m_data; }
+ const T *data() const { return m_data; }
+ T *data() { return m_data; }
private:
DenseStorage(const DenseStorage&);
DenseStorage& operator=(const DenseStorage&);
};
} // end namespace Eigen
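DenseStorage is internal, but its specializations have user-visible consequences: fixed sizes keep the coefficients inline in a plain_array (no heap traffic, optional alignment), while Dynamic sizes store a pointer plus run-time dimensions. A hedged illustration of those consequences only:

#include <Eigen/Dense>
#include <cassert>

int main()
{
  // Fixed size: the coefficients live inside the object (internal::plain_array).
  Eigen::Matrix2f f;
  assert(sizeof(f) >= 4 * sizeof(float));

  // Dynamic size: the object holds a pointer plus run-time row/column counts.
  Eigen::MatrixXf d(100, 100);   // conditional_aligned_new_auto allocates here
  d.resize(10, 10);              // DenseStorage::resize reallocates as needed
  assert(d.rows() == 10 && d.cols() == 10);

  // DontAlign selects the unaligned plain_array specialization shown above.
  Eigen::Matrix<float, 4, 4, Eigen::DontAlign> u;
  (void)f; (void)u;
  return 0;
}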

Eigen/src/Core/Diagonal.h

@@ -70,30 +70,20 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
typedef typename internal::dense_xpr_base<Diagonal>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
EIGEN_DEVICE_FUNC
inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
EIGEN_DEVICE_FUNC
  inline Index rows() const
- {
-   EIGEN_USING_STD_MATH(min);
-   return m_index.value()<0 ? (min)(Index(m_matrix.cols()),Index(m_matrix.rows()+m_index.value()))
-                            : (min)(Index(m_matrix.rows()),Index(m_matrix.cols()-m_index.value()));
- }
+ { return m_index.value()<0 ? (std::min<Index>)(m_matrix.cols(),m_matrix.rows()+m_index.value()) : (std::min<Index>)(m_matrix.rows(),m_matrix.cols()-m_index.value()); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return 1; }
EIGEN_DEVICE_FUNC
inline Index innerStride() const
{
return m_matrix.outerStride() + 1;
}
EIGEN_DEVICE_FUNC
inline Index outerStride() const
{
return 0;
@@ -105,57 +95,47 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
const Scalar
>::type ScalarWithConstIfNotLvalue;
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index)
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index row, Index) const
{
return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index row, Index) const
{
return m_matrix.coeff(row+rowOffset(), row+colOffset());
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index idx)
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index idx) const
{
return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index idx) const
{
return m_matrix.coeff(idx+rowOffset(), idx+colOffset());
}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename MatrixType::Nested>::type&
nestedExpression() const
{
return m_matrix;
}
EIGEN_DEVICE_FUNC
int index() const
{
return m_index.value();
@@ -167,11 +147,8 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
private:
// some compilers may fail to optimize std::max etc in case of compile-time constants...
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
// trigger a compile-time error if someone tries to call packet
template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
@@ -213,18 +190,18 @@ MatrixBase<Derived>::diagonal() const
*
* \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived>
- inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<DynamicIndex>::Type
+ inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
  MatrixBase<Derived>::diagonal(Index index)
  {
-   return typename DiagonalIndexReturnType<DynamicIndex>::Type(derived(), index);
+   return DiagonalDynamicIndexReturnType(derived(), index);
}
/** This is the const version of diagonal(Index). */
template<typename Derived>
- inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<DynamicIndex>::Type
+ inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
  MatrixBase<Derived>::diagonal(Index index) const
  {
-   return typename ConstDiagonalIndexReturnType<DynamicIndex>::Type(derived(), index);
+   return ConstDiagonalDynamicIndexReturnType(derived(), index);
}
/** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
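A short sketch of the diagonal() overloads implemented above; the run-time-index form builds the dynamic-index Diagonal expression, and the result is writable whenever the nested matrix is:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Matrix3i m;
  m << 1, 2, 3,
       4, 5, 6,
       7, 8, 9;

  std::cout << m.diagonal().transpose()   << "\n"; // 1 5 9 (main diagonal)
  std::cout << m.diagonal(1).transpose()  << "\n"; // 2 6   (super-diagonal)
  std::cout << m.diagonal(-1).transpose() << "\n"; // 4 8   (sub-diagonal)

  m.diagonal().setZero();  // Diagonal is an lvalue expression here
  return 0;
}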

Eigen/src/Core/DiagonalMatrix.h

@@ -37,64 +37,63 @@ class DiagonalBase : public EigenBase<Derived>
typedef DenseMatrixType DenseType;
typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;
EIGEN_DEVICE_FUNC
inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
EIGEN_DEVICE_FUNC
inline Derived& derived() { return *static_cast<Derived*>(this); }
EIGEN_DEVICE_FUNC
DenseMatrixType toDenseMatrix() const { return derived(); }
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void evalTo(MatrixBase<DenseDerived> &other) const;
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void addTo(MatrixBase<DenseDerived> &other) const
{ other.diagonal() += diagonal(); }
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void subTo(MatrixBase<DenseDerived> &other) const
{ other.diagonal() -= diagonal(); }
EIGEN_DEVICE_FUNC
inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
EIGEN_DEVICE_FUNC
inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
EIGEN_DEVICE_FUNC
inline Index rows() const { return diagonal().size(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return diagonal().size(); }
/** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
*/
template<typename MatrixDerived>
EIGEN_DEVICE_FUNC
const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
operator*(const MatrixBase<MatrixDerived> &matrix) const
{
return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
}
EIGEN_DEVICE_FUNC
inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
inverse() const
{
return diagonal().cwiseInverse();
}
EIGEN_DEVICE_FUNC
inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
operator*(const Scalar& scalar) const
{
return diagonal() * scalar;
}
EIGEN_DEVICE_FUNC
friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
operator*(const Scalar& scalar, const DiagonalBase& other)
{
return other.diagonal() * scalar;
}
#ifdef EIGEN2_SUPPORT
template<typename OtherDerived>
bool isApprox(const DiagonalBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
{
return diagonal().isApprox(other.diagonal(), precision);
}
template<typename OtherDerived>
bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
{
return toDenseMatrix().isApprox(other, precision);
}
#endif
};
template<typename Derived>
@@ -152,31 +151,24 @@ class DiagonalMatrix
public:
/** const version of diagonal(). */
EIGEN_DEVICE_FUNC
inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
/** \returns a reference to the stored vector of diagonal coefficients. */
EIGEN_DEVICE_FUNC
inline DiagonalVectorType& diagonal() { return m_diagonal; }
/** Default constructor without initialization */
EIGEN_DEVICE_FUNC
inline DiagonalMatrix() {}
/** Constructs a diagonal matrix with given dimension */
EIGEN_DEVICE_FUNC
inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
/** 2D constructor. */
EIGEN_DEVICE_FUNC
inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x,y) {}
/** 3D constructor. */
EIGEN_DEVICE_FUNC
inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
/** Copy constructor. */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}
#ifndef EIGEN_PARSED_BY_DOXYGEN
@@ -186,13 +178,11 @@ class DiagonalMatrix
/** generic constructor from expression of the diagonal coefficients */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other)
{}
/** Copy operator. */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other)
{
m_diagonal = other.diagonal();
@@ -203,7 +193,6 @@ class DiagonalMatrix
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
EIGEN_DEVICE_FUNC
DiagonalMatrix& operator=(const DiagonalMatrix& other)
{
m_diagonal = other.diagonal();
@@ -212,19 +201,14 @@ class DiagonalMatrix
#endif
/** Resizes to given size. */
EIGEN_DEVICE_FUNC
inline void resize(Index size) { m_diagonal.resize(size); }
/** Sets all coefficients to zero. */
EIGEN_DEVICE_FUNC
inline void setZero() { m_diagonal.setZero(); }
/** Resizes and sets all coefficients to zero. */
EIGEN_DEVICE_FUNC
inline void setZero(Index size) { m_diagonal.setZero(size); }
/** Sets this matrix to be the identity matrix of the current size. */
EIGEN_DEVICE_FUNC
inline void setIdentity() { m_diagonal.setOnes(); }
/** Sets this matrix to be the identity matrix of the given size. */
EIGEN_DEVICE_FUNC
inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
};
@@ -271,11 +255,9 @@ class DiagonalWrapper
#endif
/** Constructor from expression of diagonal coefficients to wrap. */
EIGEN_DEVICE_FUNC
inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
/** \returns a const reference to the wrapped expression of diagonal coefficients. */
EIGEN_DEVICE_FUNC
const DiagonalVectorType& diagonal() const { return m_diagonal; }
protected:
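A minimal usage sketch of the DiagonalBase/DiagonalMatrix API shown above:

#include <Eigen/Dense>

int main()
{
  Eigen::DiagonalMatrix<double, 3> d(1.0, 2.0, 3.0);  // the 3D constructor above
  Eigen::Vector3d v(1.0, 1.0, 1.0);

  Eigen::Vector3d w = d * v;                 // diagonal product: (1, 2, 3)
  Eigen::Matrix3d dense = d.toDenseMatrix(); // evalTo via the dense conversion

  d.setIdentity();                           // sets the stored vector to ones
  Eigen::DiagonalMatrix<double, 3> inv(d.inverse()); // coefficient-wise inverse
  (void)w; (void)dense; (void)inv;
  return 0;
}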

Eigen/src/Core/DiagonalProduct.h

@@ -34,8 +34,9 @@ struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
_LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
- Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit,//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
- CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost
+ Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0),//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
+ Cost0 = EIGEN_ADD_COST(NumTraits<Scalar>::MulCost, MatrixType::CoeffReadCost),
+ CoeffReadCost = EIGEN_ADD_COST(Cost0,DiagonalType::DiagonalVectorType::CoeffReadCost)
};
};
}
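The switch from a plain + to EIGEN_ADD_COST in CoeffReadCost matches the cost-computation fixes in this series: Dynamic is a sentinel value (-1 in Eigen), so compile-time cost arithmetic has to propagate it instead of adding through it. A hedged sketch of the idea; the name SaturatingAdd is illustrative, not Eigen's:

// Illustrative only: a compile-time "add" that propagates a Dynamic sentinel,
// which is the guarantee EIGEN_ADD_COST provides for CoeffReadCost above.
const int Dynamic = -1; // Eigen's actual sentinel value

template<int A, int B>
struct SaturatingAdd
{
  enum { value = (A == Dynamic || B == Dynamic) ? Dynamic : A + B };
};

int main()
{
  int ok  = SaturatingAdd<3, 4>::value;        // 7
  int dyn = SaturatingAdd<3, Dynamic>::value;  // stays Dynamic instead of 2
  (void)ok; (void)dyn;
  return 0;
}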

Eigen/src/Core/Dot.h

@@ -29,7 +29,6 @@ template<typename T, typename U,
struct dot_nocheck
{
typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
@@ -40,7 +39,6 @@ template<typename T, typename U>
struct dot_nocheck<T, U, true>
{
typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
@@ -61,7 +59,6 @@ struct dot_nocheck<T, U, true>
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
{
@@ -76,6 +73,34 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
}
#ifdef EIGEN2_SUPPORT
/** \returns the dot product of *this with other, with the Eigen2 convention that the dot product is linear in the first variable
* (conjugating the second variable). Of course this only makes a difference in the complex case.
*
* This method is only available in EIGEN2_SUPPORT mode.
*
* \only_for_vectors
*
* \sa dot()
*/
template<typename Derived>
template<typename OtherDerived>
typename internal::traits<Derived>::Scalar
MatrixBase<Derived>::eigen2_dot(const MatrixBase<OtherDerived>& other) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
eigen_assert(size() == other.size());
return internal::dot_nocheck<OtherDerived,Derived>::run(other,*this);
}
#endif
//---------- implementation of L2 norm and related functions ----------
/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
@@ -139,7 +164,6 @@ template<typename Derived, int p>
struct lpNorm_selector
{
typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const MatrixBase<Derived>& m)
{
using std::pow;
@@ -150,7 +174,6 @@ struct lpNorm_selector
template<typename Derived>
struct lpNorm_selector<Derived, 1>
{
EIGEN_DEVICE_FUNC
static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
{
return m.cwiseAbs().sum();
@@ -160,7 +183,6 @@ struct lpNorm_selector<Derived, 1>
template<typename Derived>
struct lpNorm_selector<Derived, 2>
{
EIGEN_DEVICE_FUNC
static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
{
return m.norm();
@@ -170,7 +192,6 @@ struct lpNorm_selector<Derived, 2>
template<typename Derived>
struct lpNorm_selector<Derived, Infinity>
{
EIGEN_DEVICE_FUNC
static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
{
return m.cwiseAbs().maxCoeff();
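The selector above dispatches lpNorm<p>() to three closed forms; for general p it computes (sum_i |x_i|^p)^(1/p) via std::pow. A sketch:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Vector3d v(1.0, -2.0, 2.0);

  std::cout << v.lpNorm<1>() << "\n";               // 5: sum of |x_i|
  std::cout << v.lpNorm<2>() << "\n";               // 3: same as v.norm()
  std::cout << v.lpNorm<Eigen::Infinity>() << "\n"; // 2: max of |x_i|
  return 0;
}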

Eigen/src/Core/EigenBase.h

@@ -31,40 +31,29 @@ template<typename Derived> struct EigenBase
typedef typename internal::traits<Derived>::Index Index;
/** \returns a reference to the derived object */
EIGEN_DEVICE_FUNC
Derived& derived() { return *static_cast<Derived*>(this); }
/** \returns a const reference to the derived object */
EIGEN_DEVICE_FUNC
const Derived& derived() const { return *static_cast<const Derived*>(this); }
EIGEN_DEVICE_FUNC
inline Derived& const_cast_derived() const
{ return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
EIGEN_DEVICE_FUNC
inline const Derived& const_derived() const
{ return *static_cast<const Derived*>(this); }
/** \returns the number of rows. \sa cols(), RowsAtCompileTime */
EIGEN_DEVICE_FUNC
inline Index rows() const { return derived().rows(); }
/** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
EIGEN_DEVICE_FUNC
inline Index cols() const { return derived().cols(); }
/** \returns the number of coefficients, which is rows()*cols().
* \sa rows(), cols(), SizeAtCompileTime. */
EIGEN_DEVICE_FUNC
inline Index size() const { return rows() * cols(); }
/** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
- template<typename Dest>
- EIGEN_DEVICE_FUNC
- inline void evalTo(Dest& dst) const
+ template<typename Dest> inline void evalTo(Dest& dst) const
{ derived().evalTo(dst); }
/** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
- template<typename Dest>
- EIGEN_DEVICE_FUNC
- inline void addTo(Dest& dst) const
+ template<typename Dest> inline void addTo(Dest& dst) const
{
// This is the default implementation,
// derived class can reimplement it in a more optimized way.
@@ -74,9 +63,7 @@ template<typename Derived> struct EigenBase
}
/** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
- template<typename Dest>
- EIGEN_DEVICE_FUNC
- inline void subTo(Dest& dst) const
+ template<typename Dest> inline void subTo(Dest& dst) const
{
// This is the default implementation,
// derived class can reimplement it in a more optimized way.
@@ -86,8 +73,7 @@ template<typename Derived> struct EigenBase
}
/** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
- template<typename Dest>
- EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const
+ template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
{
// This is the default implementation,
// derived class can reimplement it in a more optimized way.
@@ -95,8 +81,7 @@ template<typename Derived> struct EigenBase
}
/** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
- template<typename Dest>
- EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const
+ template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
{
// This is the default implementation,
// derived class can reimplement it in a more optimized way.
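EigenBase is the common root of every Eigen expression, so it is the loosest type a generic function can accept; a sketch:

#include <Eigen/Dense>
#include <iostream>

// Accepts anything deriving from EigenBase (matrices, arrays, expressions...).
template<typename Derived>
void printShape(const Eigen::EigenBase<Derived>& x)
{
  std::cout << x.rows() << " x " << x.cols()
            << " (" << x.size() << " coefficients)\n";
}

int main()
{
  Eigen::Matrix2f m;
  printShape(m);                  // 2 x 2 (4 coefficients)
  printShape(Eigen::Vector3d());  // 3 x 1 (3 coefficients)
  return 0;
}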

Eigen/src/Core/Functors.h (new file, 1026 lines): diff suppressed because it is too large.

Eigen/src/Core/Fuzzy.h

@@ -19,10 +19,9 @@ namespace internal
template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
struct isApprox_selector
{
EIGEN_DEVICE_FUNC
static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
{
- EIGEN_USING_STD_MATH(min);
+ using std::min;
typename internal::nested<Derived,2>::type nested(x);
typename internal::nested<OtherDerived,2>::type otherNested(y);
return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * (min)(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
@@ -32,7 +31,6 @@ struct isApprox_selector
template<typename Derived, typename OtherDerived>
struct isApprox_selector<Derived, OtherDerived, true>
{
EIGEN_DEVICE_FUNC
static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&)
{
return x.matrix() == y.matrix();
@@ -42,7 +40,6 @@ struct isApprox_selector<Derived, OtherDerived, true>
template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
struct isMuchSmallerThan_object_selector
{
EIGEN_DEVICE_FUNC
static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
{
return x.cwiseAbs2().sum() <= numext::abs2(prec) * y.cwiseAbs2().sum();
@@ -52,7 +49,6 @@ struct isMuchSmallerThan_object_selector
template<typename Derived, typename OtherDerived>
struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
{
EIGEN_DEVICE_FUNC
static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&)
{
return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
@@ -62,7 +58,6 @@ struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
template<typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
struct isMuchSmallerThan_scalar_selector
{
EIGEN_DEVICE_FUNC
static bool run(const Derived& x, const typename Derived::RealScalar& y, const typename Derived::RealScalar& prec)
{
return x.cwiseAbs2().sum() <= numext::abs2(prec * y);
@@ -72,7 +67,6 @@ struct isMuchSmallerThan_scalar_selector
template<typename Derived>
struct isMuchSmallerThan_scalar_selector<Derived, true>
{
EIGEN_DEVICE_FUNC
static bool run(const Derived& x, const typename Derived::RealScalar&, const typename Derived::RealScalar&)
{
return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
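In words, the selectors above implement ||x - y||^2 <= prec^2 * min(||x||^2, ||y||^2) for isApprox and ||x||^2 <= prec^2 * ||y||^2 for isMuchSmallerThan, falling back to exact equality for integer scalars. A sketch:

#include <Eigen/Dense>
#include <cassert>

int main()
{
  Eigen::Vector2d x(1.0, 1.0);
  Eigen::Vector2d y = x + Eigen::Vector2d::Constant(1e-13);

  assert(x.isApprox(y));          // well within the default dummy_precision
  assert(!x.isApprox(y, 1e-16));  // fails once the precision is tightened

  // 1e-10 * x is "much smaller than" x at the default precision.
  assert((1e-10 * x).isMuchSmallerThan(x));
  return 0;
}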

Eigen/src/Core/GeneralProduct.h

@@ -66,7 +66,8 @@ template<typename Lhs, typename Rhs> struct product_type
MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
_Rhs::MaxRowsAtCompileTime),
Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
- _Rhs::RowsAtCompileTime)
+ _Rhs::RowsAtCompileTime),
+ LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
};
// the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -256,7 +257,7 @@ template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, OuterProduct>
: public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
{
- template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+ template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
public:
EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
@@ -280,22 +281,22 @@ class GeneralProduct<Lhs, Rhs, OuterProduct>
template<typename Dest>
inline void evalTo(Dest& dest) const {
- internal::outer_product_selector_run(*this, dest, set(), IsRowMajor<Dest>());
+ internal::outer_product_selector_run(*this, dest, set(), is_row_major<Dest>());
}
template<typename Dest>
inline void addTo(Dest& dest) const {
- internal::outer_product_selector_run(*this, dest, add(), IsRowMajor<Dest>());
+ internal::outer_product_selector_run(*this, dest, add(), is_row_major<Dest>());
}
template<typename Dest>
inline void subTo(Dest& dest) const {
- internal::outer_product_selector_run(*this, dest, sub(), IsRowMajor<Dest>());
+ internal::outer_product_selector_run(*this, dest, sub(), is_row_major<Dest>());
}
template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
{
- internal::outer_product_selector_run(*this, dest, adds(alpha), IsRowMajor<Dest>());
+ internal::outer_product_selector_run(*this, dest, adds(alpha), is_row_major<Dest>());
}
};
@@ -396,7 +397,7 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
EIGEN_STRONG_INLINE Scalar* data() {
return ForceAlignment
- ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
+ ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
: m_data.array;
}
#endif
@@ -445,7 +446,7 @@ template<> struct gemv_selector<OnTheRight,ColMajor,true>
if(!evalToDest)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
- Index size = dest.size();
+ int size = dest.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
if(!alphaIsCompatible)
@@ -510,7 +511,7 @@ template<> struct gemv_selector<OnTheRight,RowMajor,true>
if(!DirectlyUseRhs)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
- Index size = actualRhs.size();
+ int size = actualRhs.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
@@ -564,40 +565,6 @@ template<> struct gemv_selector<OnTheRight,RowMajor,false>
*
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/
- #ifndef __CUDACC__
- #ifdef EIGEN_TEST_EVALUATORS
- template<typename Derived>
- template<typename OtherDerived>
- inline const Product<Derived, OtherDerived>
- MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
- {
-   // A note regarding the function declaration: In MSVC, this function will sometimes
-   // not be inlined since DenseStorage is an unwindable object for dynamic
-   // matrices and product types are holding a member to store the result.
-   // Thus it does not help tagging this function with EIGEN_STRONG_INLINE.
-   enum {
-     ProductIsValid = Derived::ColsAtCompileTime==Dynamic
-                   || OtherDerived::RowsAtCompileTime==Dynamic
-                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
-     AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
-     SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
-   };
-   // note to the lost user:
-   //   * for a dot product use: v1.dot(v2)
-   //   * for a coeff-wise product use: v1.cwiseProduct(v2)
-   EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-     INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
-   EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-     INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
-   EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
- #ifdef EIGEN_DEBUG_PRODUCT
-   internal::product_type<Derived,OtherDerived>::debug();
- #endif
-   return Product<Derived, OtherDerived>(derived(), other.derived());
- }
- #else
  template<typename Derived>
  template<typename OtherDerived>
  inline const typename ProductReturnType<Derived, OtherDerived>::Type
@@ -627,9 +594,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
  #endif
    return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
  }
- #endif
- #endif
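The static asserts above encode the "note to the lost user" in code; the three intended spellings for vector operands are shown in this sketch:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Vector3d v1(1, 2, 3), v2(4, 5, 6);

  double d = v1.dot(v2);                        // scalar dot product: 32
  Eigen::Vector3d c = v1.cwiseProduct(v2);      // coefficient-wise: 4 10 18
  Eigen::Matrix3d outer = v1 * v2.transpose();  // a valid (outer) matrix product

  // v1 * v2 would trip INVALID_VECTOR_VECTOR_PRODUCT... at compile time.
  std::cout << d << "\n" << c.transpose() << "\n" << outer << "\n";
  return 0;
}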
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
*
* The returned product will behave like any other expressions: the coefficients of the product will be

View File

@@ -42,8 +42,6 @@ namespace internal {
struct default_packet_traits
{
enum {
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
@@ -73,12 +71,10 @@ struct default_packet_traits
template<typename T> struct packet_traits : default_packet_traits
{
typedef T type;
typedef T half;
enum {
Vectorizable = 0,
size = 1,
AlignedOnScalar = 0,
HasHalfPacket = 0
AlignedOnScalar = 0
};
enum {
HasAdd = 0,
@@ -95,149 +91,94 @@ template<typename T> struct packet_traits : default_packet_traits
};
/** \internal \returns a + b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
padd(const Packet& a,
const Packet& b) { return a+b; }
/** \internal \returns a - b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
psub(const Packet& a,
const Packet& b) { return a-b; }
/** \internal \returns -a (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pnegate(const Packet& a) { return -a; }
/** \internal \returns conj(a) (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pconj(const Packet& a) { return numext::conj(a); }
/** \internal \returns a * b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pmul(const Packet& a,
const Packet& b) { return a*b; }
/** \internal \returns a / b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pdiv(const Packet& a,
const Packet& b) { return a/b; }
/** \internal \returns the min of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pmin(const Packet& a,
const Packet& b) { EIGEN_USING_STD_MATH(min); return (min)(a, b); }
const Packet& b) { using std::min; return (min)(a, b); }
/** \internal \returns the max of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pmax(const Packet& a,
const Packet& b) { EIGEN_USING_STD_MATH(max); return (max)(a, b); }
const Packet& b) { using std::max; return (max)(a, b); }
/** \internal \returns the absolute value of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pabs(const Packet& a) { using std::abs; return abs(a); }
/** \internal \returns the bitwise and of \a a and \a b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pand(const Packet& a, const Packet& b) { return a & b; }
/** \internal \returns the bitwise or of \a a and \a b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
por(const Packet& a, const Packet& b) { return a | b; }
/** \internal \returns the bitwise xor of \a a and \a b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pxor(const Packet& a, const Packet& b) { return a ^ b; }
/** \internal \returns the bitwise andnot of \a a and \a b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pandnot(const Packet& a, const Packet& b) { return a & (!b); }
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet version of \a *from, (un-aligned load) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
/** \internal \returns a packet with elements of \a *from duplicated.
* For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
* For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
* duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
* Currently, this function is only used for scalar * complex products.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
*/
template<typename Packet> inline Packet
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet with elements of \a *from quadrupled.
* For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
* replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
* Currently, this function is only used in matrix products.
* For packet-size smaller or equal to 4, this function is equivalent to pload1
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadquad(const typename unpacket_traits<Packet>::type* from)
{ return pload1<Packet>(from); }
/** \internal equivalent to
* \code
* a0 = pload1(a+0);
* a1 = pload1(a+1);
* a2 = pload1(a+2);
* a3 = pload1(a+3);
* \endcode
* \sa pset1, pload1, ploaddup, pbroadcast2
*/
template<typename Packet> EIGEN_DEVICE_FUNC
inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
Packet& a0, Packet& a1, Packet& a2, Packet& a3)
{
a0 = pload1<Packet>(a+0);
a1 = pload1<Packet>(a+1);
a2 = pload1<Packet>(a+2);
a3 = pload1<Packet>(a+3);
}
/** \internal equivalent to
* \code
* a0 = pload1(a+0);
* a1 = pload1(a+1);
* \endcode
* \sa pset1, pload1, ploaddup, pbroadcast4
*/
template<typename Packet> EIGEN_DEVICE_FUNC
inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
Packet& a0, Packet& a1)
{
a0 = pload1<Packet>(a+0);
a1 = pload1<Packet>(a+1);
}
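For the generic scalar case the broadcast helpers above reduce to plain loads. A standalone sketch with Packet = float, where pload1 is just a dereference (broadcast4_sketch is an illustrative name):
void broadcast4_sketch(const float* a, float& a0, float& a1,
                       float& a2, float& a3) {
  a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];  // one "packet" per coefficient
}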
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
template<typename Packet> inline Packet
pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Scalar> inline typename packet_traits<Scalar>::type
plset(const Scalar& a) { return a; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
template<typename Scalar, typename Packet> inline void pstore(Scalar* to, const Packet& from)
{ (*to) = from; }
/** \internal copy the packet \a from to \a *to, (un-aligned store) */
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
{ (*to) = from; }
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, DenseIndex /*stride*/)
{ return ploadu<Packet>(from); }
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, DenseIndex /*stride*/)
{ pstore(to, from); }
template<typename Scalar, typename Packet> inline void pstoreu(Scalar* to, const Packet& from)
{ (*to) = from; }
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> inline void prefetch(const Scalar* addr)
@@ -248,45 +189,36 @@ __builtin_prefetch(addr);
}
/** \internal \returns the first element of a packet */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
template<typename Packet> inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
{ return a; }
/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
preduxp(const Packet* vecs) { return vecs[0]; }
/** \internal \returns the sum of the elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
{ return a; }
/** \internal \returns the sum of the elements of \a a by block of 4 elements.
* For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
* For packet-size smaller or equal to 4, this boils down to a noop.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
predux4(const Packet& a)
template<typename Packet> inline typename unpacket_traits<Packet>::type predux(const Packet& a)
{ return a; }
/** \internal \returns the product of the elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
template<typename Packet> inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
{ return a; }
/** \internal \returns the min of the elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
template<typename Packet> inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
{ return a; }
/** \internal \returns the max of the elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
template<typename Packet> inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
{ return a; }
/** \internal \returns the reversed elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
template<typename Packet> inline Packet preverse(const Packet& a)
{ return a; }
/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
template<typename Packet> inline Packet pcplxflip(const Packet& a)
{
// FIXME: uncomment the following in case we drop the internal imag and real functions.
// using std::imag;
@@ -318,10 +250,6 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pacos(const Packet& a) { using std::acos; return acos(a); }
/** \internal \returns the atan of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet patan(const Packet& a) { using std::atan; return atan(a); }
/** \internal \returns the exp of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -347,7 +275,7 @@ inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename u
}
/** \internal \returns a * b + c (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> inline Packet
pmadd(const Packet& a,
const Packet& b,
const Packet& c)
@@ -408,33 +336,15 @@ inline void palign(PacketType& first, const PacketType& second)
* Fast complex products (GCC generates a function call which is very slow)
***************************************************************************/
// Eigen+CUDA does not support complexes.
#ifndef __CUDACC__
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
#endif
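The specializations above inline the textbook complex product to sidestep the slow library call. A self-contained equivalent (fast_cmul is an illustrative name; like the code above it performs no inf/NaN fix-ups):
#include <complex>
inline std::complex<float> fast_cmul(const std::complex<float>& a,
                                     const std::complex<float>& b) {
  return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(),
                             a.imag()*b.real() + a.real()*b.imag());
}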
/***************************************************************************
* PacketBlock, that is a collection of N packets where the number of words
* in the packet is a multiple of N.
***************************************************************************/
template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
Packet packet[N];
};
template<typename Packet> EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
// Nothing to do in the scalar case, i.e. a 1x1 matrix.
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GENERIC_PACKET_MATH_H
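Because every primitive in this header has a scalar fallback, a generic kernel can be instantiated with Packet = float and no SIMD at all. A minimal sketch against the internal API as declared above (internal namespace, so subject to change):
#include <Eigen/Core>
float sum_two(const float* x) {
  using namespace Eigen::internal;
  float p0 = pload<float>(x);      // scalar "packet": a plain dereference
  float p1 = pload<float>(x + 1);
  return predux(padd(p0, p1));     // size-1 reduction: returns the sum itself
}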

View File

@@ -45,7 +45,6 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)

View File

@@ -49,7 +49,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
*/
struct IOFormat
{
/** Default constructor, see class IOFormat for the meaning of the parameters */
/** Default contructor, see class IOFormat for the meaning of the parameters */
IOFormat(int _precision = StreamPrecision, int _flags = 0,
const std::string& _coeffSeparator = " ",
const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
@@ -57,10 +57,6 @@ struct IOFormat
: matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
{
// TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
// don't add rowSpacer if columns are not to be aligned
if((flags & DontAlignCols))
return;
int i = int(matSuffix.length())-1;
while (i>=0 && matSuffix[i]!='\n')
{
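A usage sketch for the constructor documented above, matching the parameter order (precision, flags, coeffSeparator, rowSeparator, rowPrefix, rowSuffix):
#include <Eigen/Dense>
#include <iostream>
int main() {
  Eigen::IOFormat fmt(4, 0, ", ", "\n", "[", "]");
  Eigen::Matrix2d m;
  m << 1, 2, 3, 4;
  std::cout << m.format(fmt) << "\n";  // prints "[1, 2]" then "[3, 4]"
}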

View File

@@ -88,7 +88,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
&& ( bool(IsDynamicSize)
|| HasNoOuterStride
|| ( OuterStrideAtCompileTime!=Dynamic
&& ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
&& ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
@@ -110,17 +110,19 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
EIGEN_DENSE_PUBLIC_INTERFACE(Map)
typedef typename Base::PointerType PointerType;
#if EIGEN2_SUPPORT_STAGE <= STAGE30_FULL_EIGEN3_API
typedef const Scalar* PointerArgType;
inline PointerType cast_to_pointer_type(PointerArgType ptr) { return const_cast<PointerType>(ptr); }
#else
typedef PointerType PointerArgType;
EIGEN_DEVICE_FUNC
inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
#endif
EIGEN_DEVICE_FUNC
inline Index innerStride() const
{
return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
}
EIGEN_DEVICE_FUNC
inline Index outerStride() const
{
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
@@ -134,7 +136,6 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
* \param dataPtr pointer to the array to map
* \param a_stride optional Stride object, passing the strides.
*/
EIGEN_DEVICE_FUNC
inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
: Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
{
@@ -147,7 +148,6 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
* \param a_size the size of the vector expression
* \param a_stride optional Stride object, passing the strides.
*/
EIGEN_DEVICE_FUNC
inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
: Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
{
@@ -161,7 +161,6 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
* \param nbCols the number of columns of the matrix expression
* \param a_stride optional Stride object, passing the strides.
*/
EIGEN_DEVICE_FUNC
inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
: Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
{
@@ -174,6 +173,19 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
StrideType m_stride;
};
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
inline Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
::Array(const Scalar *data)
{
this->_set_noalias(Eigen::Map<const Array>(data));
}
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
inline Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
::Matrix(const Scalar *data)
{
this->_set_noalias(Eigen::Map<const Matrix>(data));
}
} // end namespace Eigen
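A usage sketch for the stride-aware Map constructors above: viewing every other element of a raw buffer as a vector.
#include <Eigen/Dense>
double data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
// inner stride 2: the view reads data[0], data[2], data[4], data[6]
Eigen::Map<Eigen::VectorXd, 0, Eigen::InnerStride<2> > v(data, 4);
// v == [1, 3, 5, 7]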

View File

@@ -76,8 +76,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
typedef typename Base::CoeffReturnType CoeffReturnType;
EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
inline Index rows() const { return m_rows.value(); }
inline Index cols() const { return m_cols.value(); }
/** Returns a pointer to the first coefficient of the matrix or vector.
*
@@ -87,26 +87,22 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
*/
inline const Scalar* data() const { return m_data; }
EIGEN_DEVICE_FUNC
inline const Scalar& coeff(Index rowId, Index colId) const
{
return m_data[colId * colStride() + rowId * rowStride()];
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeff(Index index) const
{
EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
return m_data[index * innerStride()];
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
return this->m_data[colId * colStride() + rowId * rowStride()];
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -127,14 +123,12 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
}
EIGEN_DEVICE_FUNC
inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
{
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
checkSanity();
}
EIGEN_DEVICE_FUNC
inline MapBase(PointerType dataPtr, Index vecSize)
: m_data(dataPtr),
m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
@@ -146,7 +140,6 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
checkSanity();
}
EIGEN_DEVICE_FUNC
inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
: m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
{
@@ -158,14 +151,13 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
protected:
EIGEN_DEVICE_FUNC
void checkSanity() const
{
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
internal::inner_stride_at_compile_time<Derived>::ret==1),
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0)
&& "data is not aligned");
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
&& "input pointer is not aligned on a 16 byte boundary");
}
PointerType m_data;
@@ -176,6 +168,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
template<typename Derived> class MapBase<Derived, WriteAccessors>
: public MapBase<Derived, ReadOnlyAccessors>
{
typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
public:
typedef MapBase<Derived, ReadOnlyAccessors> Base;
@@ -203,18 +196,14 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
const Scalar
>::type ScalarWithConstIfNotLvalue;
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return this->m_data; }
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } // no const-cast here so non-const-correct code will give a compile error
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col)
{
return this->m_data[col * colStride() + row * rowStride()];
}
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
{
EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -236,18 +225,19 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
(this->m_data + index * innerStride(), val);
}
EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
EIGEN_DEVICE_FUNC
Derived& operator=(const MapBase& other)
{
Base::Base::operator=(other);
ReadOnlyMapBase::Base::operator=(other);
return derived();
}
using Base::Base::operator=;
// In theory we could simply refer to Base::Base::operator=, but MSVC does not like Base::Base,

// see bugs 821 and 920.
using ReadOnlyMapBase::Base::operator=;
};
#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS

View File

@@ -12,15 +12,6 @@
namespace Eigen {
// On WINCE, std::abs is defined for int only, so let's define our own overloads:
// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500
long abs(long x) { return (labs(x)); }
double abs(double x) { return (fabs(x)); }
float abs(float x) { return (fabsf(x)); }
long double abs(long double x) { return (fabsl(x)); }
#endif
namespace internal {
/** \internal \struct global_math_functions_filtering_base
@@ -71,7 +62,6 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct real_default_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
return x;
@@ -82,7 +72,6 @@ template<typename Scalar>
struct real_default_impl<Scalar,true>
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
using std::real;
@@ -98,6 +87,7 @@ struct real_retval
typedef typename NumTraits<Scalar>::Real type;
};
/****************************************************************************
* Implementation of imag *
****************************************************************************/
@@ -106,7 +96,6 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct imag_default_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar&)
{
return RealScalar(0);
@@ -117,7 +106,6 @@ template<typename Scalar>
struct imag_default_impl<Scalar,true>
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
using std::imag;
@@ -141,12 +129,10 @@ template<typename Scalar>
struct real_ref_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar& run(Scalar& x)
{
return reinterpret_cast<RealScalar*>(&x)[0];
}
EIGEN_DEVICE_FUNC
static inline const RealScalar& run(const Scalar& x)
{
return reinterpret_cast<const RealScalar*>(&x)[0];
@@ -167,12 +153,10 @@ template<typename Scalar, bool IsComplex>
struct imag_ref_default_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar& run(Scalar& x)
{
return reinterpret_cast<RealScalar*>(&x)[1];
}
EIGEN_DEVICE_FUNC
static inline const RealScalar& run(const Scalar& x)
{
return reinterpret_cast<RealScalar*>(&x)[1];
@@ -182,12 +166,10 @@ struct imag_ref_default_impl
template<typename Scalar>
struct imag_ref_default_impl<Scalar, false>
{
EIGEN_DEVICE_FUNC
static inline Scalar run(Scalar&)
{
return Scalar(0);
}
EIGEN_DEVICE_FUNC
static inline const Scalar run(const Scalar&)
{
return Scalar(0);
@@ -210,7 +192,6 @@ struct imag_ref_retval
template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct conj_impl
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
{
return x;
@@ -220,7 +201,6 @@ struct conj_impl
template<typename Scalar>
struct conj_impl<Scalar,true>
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
{
using std::conj;
@@ -242,7 +222,6 @@ template<typename Scalar>
struct abs2_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
return x*x;
@@ -252,7 +231,6 @@ struct abs2_impl
template<typename RealScalar>
struct abs2_impl<std::complex<RealScalar> >
{
EIGEN_DEVICE_FUNC
static inline RealScalar run(const std::complex<RealScalar>& x)
{
return real(x)*real(x) + imag(x)*imag(x);
@@ -273,7 +251,6 @@ template<typename Scalar, bool IsComplex>
struct norm1_default_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
using std::abs;
@@ -284,7 +261,6 @@ struct norm1_default_impl
template<typename Scalar>
struct norm1_default_impl<Scalar, false>
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
{
using std::abs;
@@ -311,23 +287,16 @@ struct hypot_impl
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(max);
EIGEN_USING_STD_MATH(min);
using std::max;
using std::min;
using std::abs;
using std::sqrt;
RealScalar _x = abs(x);
RealScalar _y = abs(y);
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
RealScalar p = (max)(_x, _y);
if(p==RealScalar(0)) return RealScalar(0);
RealScalar q = (min)(_x, _y);
RealScalar qp = q/p;
return p * sqrt(RealScalar(1) + qp*qp);
}
};
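Both versions above implement the same overflow-safe hypot; a standalone restatement of the idea (stable_hypot is an illustrative name): factor out the larger magnitude so the squared ratio stays at most 1.
#include <algorithm>
#include <cmath>
double stable_hypot(double x, double y) {
  double ax = std::abs(x), ay = std::abs(y);
  double p = std::max(ax, ay);
  if (p == 0.0) return 0.0;           // avoid 0/0
  double qp = std::min(ax, ay) / p;   // qp <= 1, so qp*qp cannot overflow
  return p * std::sqrt(1.0 + qp * qp);
}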
@@ -363,45 +332,37 @@ inline NewType cast(const OldType& x)
* Implementation of atanh2 *
****************************************************************************/
template<typename Scalar>
struct atanh2_impl
template<typename Scalar, bool IsInteger>
struct atanh2_default_impl
{
static inline Scalar run(const Scalar& x, const Scalar& r)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if (__cplusplus >= 201103L) && !defined(__CYGWIN__)
using std::log1p;
return log1p(2 * x / (r - x)) / 2;
#else
using std::abs;
using std::log;
using std::sqrt;
Scalar z = x / r;
if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon()))
return log((r + x) / (r - x)) / 2;
else
return z + z*z*z / 3;
#endif
}
};
template<typename RealScalar>
struct atanh2_impl<std::complex<RealScalar> >
{
typedef std::complex<RealScalar> Scalar;
static inline Scalar run(const Scalar& x, const Scalar& r)
typedef Scalar retval;
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline Scalar run(const Scalar& x, const Scalar& y)
{
using std::abs;
using std::log;
using std::norm;
using std::sqrt;
Scalar z = x / r;
if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon())
return RealScalar(0.5) * log((r + x) / (r - x));
Scalar z = x / y;
if (y == Scalar(0) || abs(z) > sqrt(NumTraits<RealScalar>::epsilon()))
return RealScalar(0.5) * log((y + x) / (y - x));
else
return z + z*z*z / RealScalar(3);
}
};
template<typename Scalar>
struct atanh2_default_impl<Scalar, true>
{
static inline Scalar run(const Scalar&, const Scalar&)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
return Scalar(0);
}
};
template<typename Scalar>
struct atanh2_impl : atanh2_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
template<typename Scalar>
struct atanh2_retval
{
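A standalone sketch of the real-scalar branch above, where atanh2(x, y) = atanh(x/y) with a series fallback when x/y is below sqrt(epsilon); one branch in the diff switches to log1p when C++11 is available. atanh2_sketch is an illustrative name.
#include <cmath>
#include <limits>
double atanh2_sketch(double x, double y) {
  double z = x / y;
  if (y == 0.0 || std::abs(z) > std::sqrt(std::numeric_limits<double>::epsilon()))
    return 0.5 * std::log((y + x) / (y - x));
  return z + z*z*z / 3.0;  // 3-term Taylor series, accurate for tiny z
}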
@@ -593,84 +554,72 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
namespace numext {
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x)
{
return internal::real_ref_impl<Scalar>::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
{
return internal::imag_ref_impl<Scalar>::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y)
{
return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
{
return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
}
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
{
return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
@@ -678,22 +627,11 @@ inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
// std::isfinite is non standard, so let's define our own version,
// even though it is not very efficient.
template<typename T>
EIGEN_DEVICE_FUNC
bool (isfinite)(const T& x)
template<typename T> bool (isfinite)(const T& x)
{
return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
}
template<typename T>
EIGEN_DEVICE_FUNC
bool (isfinite)(const std::complex<T>& x)
{
using std::real;
using std::imag;
return isfinite(real(x)) && isfinite(imag(x));
}
} // end namespace numext
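The isfinite above is a pure range check. A standalone restatement, assuming NumTraits<T>::highest()/lowest() map to plus/minus numeric_limits<T>::max() for floating-point types (is_finite_sketch is an illustrative name):
#include <limits>
template <typename T>
bool is_finite_sketch(const T& x) {
  // NaN fails both comparisons; +inf fails the first, -inf the second.
  return x < std::numeric_limits<T>::max() && x > -std::numeric_limits<T>::max();
}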
namespace internal {
@@ -711,20 +649,18 @@ template<typename Scalar>
struct scalar_fuzzy_default_impl<Scalar, false, false>
{
typedef typename NumTraits<Scalar>::Real RealScalar;
template<typename OtherScalar> EIGEN_DEVICE_FUNC
template<typename OtherScalar>
static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
{
using std::abs;
return abs(x) <= abs(y) * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
{
EIGEN_USING_STD_MATH(min);
using std::min;
using std::abs;
return abs(x - y) <= (min)(abs(x), abs(y)) * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
{
return x <= y || isApprox(x, y, prec);
@@ -735,17 +671,15 @@ template<typename Scalar>
struct scalar_fuzzy_default_impl<Scalar, false, true>
{
typedef typename NumTraits<Scalar>::Real RealScalar;
template<typename OtherScalar> EIGEN_DEVICE_FUNC
template<typename OtherScalar>
static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
{
return x == Scalar(0);
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
{
return x == y;
}
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
{
return x <= y;
@@ -763,7 +697,7 @@ struct scalar_fuzzy_default_impl<Scalar, true, false>
}
static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
{
EIGEN_USING_STD_MATH(min);
using std::min;
return numext::abs2(x - y) <= (min)(numext::abs2(x), numext::abs2(y)) * prec * prec;
}
};
@@ -771,21 +705,21 @@ struct scalar_fuzzy_default_impl<Scalar, true, false>
template<typename Scalar>
struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
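The non-integer isApprox above measures the difference against the smaller of the two magnitudes. A standalone restatement for double (is_approx_sketch is an illustrative name):
#include <algorithm>
#include <cmath>
bool is_approx_sketch(double x, double y, double prec = 1e-12) {
  // relative test: true for 1.0 vs 1.0 + 1e-13, false for 1e-20 vs 0.0
  return std::abs(x - y) <= std::min(std::abs(x), std::abs(y)) * prec;
}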
template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
template<typename Scalar, typename OtherScalar>
inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
{
return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
}
template<typename Scalar> EIGEN_DEVICE_FUNC
template<typename Scalar>
inline bool isApprox(const Scalar& x, const Scalar& y,
typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
{
return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
}
template<typename Scalar> EIGEN_DEVICE_FUNC
template<typename Scalar>
inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
{
@@ -808,19 +742,17 @@ template<> struct scalar_fuzzy_impl<bool>
{
typedef bool RealScalar;
template<typename OtherScalar> EIGEN_DEVICE_FUNC
template<typename OtherScalar>
static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
{
return !x;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(bool x, bool y, bool)
{
return x == y;
}
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
{
return (!x) || y;

View File

@@ -151,7 +151,6 @@ class Matrix
*
* \callgraph
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
{
return Base::_set(other);
@@ -168,7 +167,6 @@ class Matrix
* remain row-vectors and vectors remain vectors.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
{
return Base::_set(other);
@@ -181,14 +179,12 @@ class Matrix
* \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived> &other)
{
return Base::operator=(other);
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func)
{
return Base::operator=(func);
@@ -204,7 +200,6 @@ class Matrix
*
* \sa resize(Index,Index)
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix() : Base()
{
Base::_check_template_params();
@@ -212,7 +207,6 @@ class Matrix
}
// FIXME is it still needed
EIGEN_DEVICE_FUNC
Matrix(internal::constructor_without_unaligned_array_assert)
: Base(internal::constructor_without_unaligned_array_assert())
{ Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
@@ -232,65 +226,41 @@ class Matrix
}
#endif
#ifndef EIGEN_PARSED_BY_DOXYGEN
// This constructor is for both 1x1 matrices and dynamic vectors
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit Matrix(const T& x)
/** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
*
* Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
* it is redundant to pass the dimension here, so it makes more sense to use the default
* constructor Matrix() instead.
*/
EIGEN_STRONG_INLINE explicit Matrix(Index dim)
: Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
{
Base::_check_template_params();
Base::template _init1<T>(x);
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Matrix)
eigen_assert(dim >= 0);
eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename T0, typename T1>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
{
Base::_check_template_params();
Base::template _init2<T0,T1>(x, y);
}
#else
/** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
EIGEN_DEVICE_FUNC
explicit Matrix(const Scalar *data);
/** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
*
* This is useful for dynamic-size vectors. For fixed-size vectors,
* it is redundant to pass these parameters, so one should use the default constructor
* Matrix() instead.
*
* \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
* calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
* For fixed-size \c 1x1 matrices it is therefore recommended to use the default
* constructor Matrix() instead, especially when using one of the non standard
* \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
*/
EIGEN_STRONG_INLINE explicit Matrix(Index dim);
/** \brief Constructs an initialized 1x1 matrix with the given coefficient */
Matrix(const Scalar& x);
/** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
*
* This is useful for dynamic-size matrices. For fixed-size matrices,
* it is redundant to pass these parameters, so one should use the default constructor
* Matrix() instead.
*
* \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
* calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
* For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
* constructor Matrix() instead, especially when using one of the non standard
* \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
*/
EIGEN_DEVICE_FUNC
* Matrix() instead. */
Matrix(Index rows, Index cols);
/** \brief Constructs an initialized 2D vector with given coefficients */
Matrix(const Scalar& x, const Scalar& y);
#endif
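A usage sketch of the constructor set documented above:
#include <Eigen/Dense>
Eigen::VectorXd v(5);                // dynamic vector of dimension 5
Eigen::MatrixXd m(2, 3);             // uninitialized 2x3 dynamic matrix
Eigen::Vector2d p(1.0, 2.0);         // 2D vector with given coefficients
Eigen::Matrix<double, 1, 1> s(7.0);  // fixed 1x1: the Scalar-initializing ctor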
/** \brief Constructs an initialized 3D vector with given coefficients */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
{
Base::_check_template_params();
@@ -300,7 +270,6 @@ class Matrix
m_storage.data()[2] = z;
}
/** \brief Constructs an initialized 4D vector with given coefficients */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
{
Base::_check_template_params();
@@ -311,10 +280,10 @@ class Matrix
m_storage.data()[3] = w;
}
explicit Matrix(const Scalar *data);
/** \brief Constructor copying the value of the expression \a other */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
: Base(other.rows() * other.cols(), other.rows(), other.cols())
{
@@ -327,7 +296,6 @@ class Matrix
Base::_set_noalias(other);
}
/** \brief Copy constructor */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const Matrix& other)
: Base(other.rows() * other.cols(), other.rows(), other.cols())
{
@@ -336,7 +304,6 @@ class Matrix
}
/** \brief Copy constructor with in-place evaluation */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
{
Base::_check_template_params();
@@ -348,7 +315,6 @@ class Matrix
* \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
: Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
{
@@ -364,22 +330,26 @@ class Matrix
* of same type it is enough to swap the data pointers.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(MatrixBase<OtherDerived> const & other)
{ this->_swap(other.derived()); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
inline Index innerStride() const { return 1; }
inline Index outerStride() const { return this->innerSize(); }
/////////// Geometry module ///////////
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
explicit Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
#ifdef EIGEN2_SUPPORT
template<typename OtherDerived>
explicit Matrix(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
template<typename OtherDerived>
Matrix& operator=(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
#endif
// allow to extend Matrix outside Eigen
#ifdef EIGEN_MATRIX_PLUGIN
#include EIGEN_MATRIX_PLUGIN

View File

@@ -81,7 +81,6 @@ template<typename Derived> class MatrixBase
using Base::operator-=;
using Base::operator*=;
using Base::operator/=;
using Base::operator*;
typedef typename Base::CoeffReturnType CoeffReturnType;
typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@@ -99,7 +98,6 @@ template<typename Derived> class MatrixBase
/** \returns the size of the main diagonal, which is min(rows(),cols()).
* \sa rows(), cols(), SizeAtCompileTime. */
EIGEN_DEVICE_FUNC
inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
/** \brief The plain matrix type corresponding to this expression.
@@ -147,59 +145,36 @@ template<typename Derived> class MatrixBase
/** Special case of the template operator=, in order to prevent the compiler
* from generating a default operator= (issue hit with g++ 4.1)
*/
EIGEN_DEVICE_FUNC
Derived& operator=(const MatrixBase& other);
// We cannot inherit here via Base::operator= since it is causing
// trouble with MSVC.
template <typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator=(const DenseBase<OtherDerived>& other);
template <typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator=(const EigenBase<OtherDerived>& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& other);
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
#endif // not EIGEN_PARSED_BY_DOXYGEN
template<typename MatrixPower, typename Lhs, typename Rhs>
Derived& lazyAssign(const MatrixPowerProduct<MatrixPower, Lhs,Rhs>& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator+=(const MatrixBase<OtherDerived>& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& operator-=(const MatrixBase<OtherDerived>& other);
#ifdef __CUDACC__
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const typename LazyProductReturnType<Derived,OtherDerived>::Type
operator*(const MatrixBase<OtherDerived> &other) const
{ return this->lazyProduct(other); }
#else
#ifdef EIGEN_TEST_EVALUATORS
template<typename OtherDerived>
const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const;
#else
template<typename OtherDerived>
const typename ProductReturnType<Derived,OtherDerived>::Type
operator*(const MatrixBase<OtherDerived> &other) const;
#endif
#endif
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const typename LazyProductReturnType<Derived,OtherDerived>::Type
lazyProduct(const MatrixBase<OtherDerived> &other) const;
@@ -213,96 +188,84 @@ template<typename Derived> class MatrixBase
void applyOnTheRight(const EigenBase<OtherDerived>& other);
template<typename DiagonalDerived>
EIGEN_DEVICE_FUNC
const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
dot(const MatrixBase<OtherDerived>& other) const;
EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
EIGEN_DEVICE_FUNC RealScalar norm() const;
#ifdef EIGEN2_SUPPORT
template<typename OtherDerived>
Scalar eigen2_dot(const MatrixBase<OtherDerived>& other) const;
#endif
RealScalar squaredNorm() const;
RealScalar norm() const;
RealScalar stableNorm() const;
RealScalar blueNorm() const;
RealScalar hypotNorm() const;
EIGEN_DEVICE_FUNC const PlainObject normalized() const;
EIGEN_DEVICE_FUNC void normalize();
const PlainObject normalized() const;
void normalize();
EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
EIGEN_DEVICE_FUNC void adjointInPlace();
const AdjointReturnType adjoint() const;
void adjointInPlace();
typedef Diagonal<Derived> DiagonalReturnType;
EIGEN_DEVICE_FUNC
DiagonalReturnType diagonal();
typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
EIGEN_DEVICE_FUNC
ConstDiagonalReturnType diagonal() const;
template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };
template<int Index>
EIGEN_DEVICE_FUNC
typename DiagonalIndexReturnType<Index>::Type diagonal();
template<int Index>
EIGEN_DEVICE_FUNC
typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
// Note: The "MatrixBase::" prefixes are added to help MSVC9 to match these declarations with the later implementations.
// On the other hand they confuse MSVC8...
#if (defined _MSC_VER) && (_MSC_VER >= 1500) // 2008 or later
typename MatrixBase::template DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
typename MatrixBase::template ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
#else
EIGEN_DEVICE_FUNC
typename DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
template<int Index> typename DiagonalIndexReturnType<Index>::Type diagonal();
template<int Index> typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
EIGEN_DEVICE_FUNC
typename ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
#endif
typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
DiagonalDynamicIndexReturnType diagonal(Index index);
ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
#ifdef EIGEN2_SUPPORT
template<unsigned int Mode> typename internal::eigen2_part_return_type<Derived, Mode>::type part();
template<unsigned int Mode> const typename internal::eigen2_part_return_type<Derived, Mode>::type part() const;
// huuuge hack. make Eigen2's matrix.part<Diagonal>() work in eigen3. Problem: Diagonal is now a class template instead
// of an integer constant. Solution: overload the part() method template wrt template parameters list.
template<template<typename T, int N> class U>
const DiagonalWrapper<ConstDiagonalReturnType> part() const
{ return diagonal().asDiagonal(); }
#endif // EIGEN2_SUPPORT
template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
template<unsigned int Mode>
EIGEN_DEVICE_FUNC
typename TriangularViewReturnType<Mode>::Type triangularView();
template<unsigned int Mode>
EIGEN_DEVICE_FUNC
typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
template<unsigned int Mode> typename TriangularViewReturnType<Mode>::Type triangularView();
template<unsigned int Mode> typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };
template<unsigned int UpLo>
EIGEN_DEVICE_FUNC
typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
template<unsigned int UpLo>
EIGEN_DEVICE_FUNC
typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
template<unsigned int UpLo> typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
template<unsigned int UpLo> typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
const SparseView<Derived> sparseView(const Scalar& m_reference = Scalar(0),
const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
EIGEN_DEVICE_FUNC static const IdentityReturnType Identity();
EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(Index rows, Index cols);
EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index size, Index i);
EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index i);
EIGEN_DEVICE_FUNC static const BasisReturnType UnitX();
EIGEN_DEVICE_FUNC static const BasisReturnType UnitY();
EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ();
EIGEN_DEVICE_FUNC static const BasisReturnType UnitW();
static const IdentityReturnType Identity();
static const IdentityReturnType Identity(Index rows, Index cols);
static const BasisReturnType Unit(Index size, Index i);
static const BasisReturnType Unit(Index i);
static const BasisReturnType UnitX();
static const BasisReturnType UnitY();
static const BasisReturnType UnitZ();
static const BasisReturnType UnitW();
EIGEN_DEVICE_FUNC
const DiagonalWrapper<const Derived> asDiagonal() const;
const PermutationWrapper<const Derived> asPermutation() const;
EIGEN_DEVICE_FUNC
Derived& setIdentity();
EIGEN_DEVICE_FUNC
Derived& setIdentity(Index rows, Index cols);
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -341,26 +304,42 @@ template<typename Derived> class MatrixBase
Scalar trace() const;
template<int p> EIGEN_DEVICE_FUNC RealScalar lpNorm() const;
/////////// Array module ///////////
EIGEN_DEVICE_FUNC MatrixBase<Derived>& matrix() { return *this; }
EIGEN_DEVICE_FUNC const MatrixBase<Derived>& matrix() const { return *this; }
template<int p> RealScalar lpNorm() const;
MatrixBase<Derived>& matrix() { return *this; }
const MatrixBase<Derived>& matrix() const { return *this; }
/** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
* \sa ArrayBase::matrix() */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return derived(); }
/** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
* \sa ArrayBase::matrix() */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return derived(); }
ArrayWrapper<Derived> array() { return derived(); }
const ArrayWrapper<const Derived> array() const { return derived(); }
/////////// LU module ///////////
EIGEN_DEVICE_FUNC const FullPivLU<PlainObject> fullPivLu() const;
EIGEN_DEVICE_FUNC const PartialPivLU<PlainObject> partialPivLu() const;
const FullPivLU<PlainObject> fullPivLu() const;
const PartialPivLU<PlainObject> partialPivLu() const;
#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
const LU<PlainObject> lu() const;
#endif
#ifdef EIGEN2_SUPPORT
const LU<PlainObject> eigen2_lu() const;
#endif
#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
const PartialPivLU<PlainObject> lu() const;
#endif
#ifdef EIGEN2_SUPPORT
template<typename ResultType>
void computeInverse(MatrixBase<ResultType> *result) const {
*result = this->inverse();
}
#endif
EIGEN_DEVICE_FUNC
const internal::inverse_impl<Derived> inverse() const;
template<typename ResultType>
void computeInverseAndDetWithCheck(
@@ -387,6 +366,10 @@ template<typename Derived> class MatrixBase
const HouseholderQR<PlainObject> householderQr() const;
const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
#ifdef EIGEN2_SUPPORT
const QR<PlainObject> qr() const;
#endif
EigenvaluesReturnType eigenvalues() const;
RealScalar operatorNorm() const;
@@ -395,6 +378,10 @@ template<typename Derived> class MatrixBase
JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
#ifdef EIGEN2_SUPPORT
SVD<PlainObject> svd() const;
#endif
/////////// Geometry module ///////////
#ifndef EIGEN_PARSED_BY_DOXYGEN
@@ -405,24 +392,20 @@ template<typename Derived> class MatrixBase
};
#endif // EIGEN_PARSED_BY_DOXYGEN
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
typename cross_product_return_type<OtherDerived>::type
cross(const MatrixBase<OtherDerived>& other) const;
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
EIGEN_DEVICE_FUNC
PlainObject unitOrthogonal(void) const;
Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
// put this as separate enum value to work around possible GCC 4.3 bug (?)
enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
HomogeneousReturnType homogeneous() const;
#endif
enum {
SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
@@ -457,6 +440,15 @@ template<typename Derived> class MatrixBase
template<typename OtherScalar>
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
///////// SparseCore module /////////
template<typename OtherDerived>
EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
cwiseProduct(const SparseMatrixBase<OtherDerived> &other) const
{
return other.cwiseProduct(derived());
}
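A usage sketch of the overload above, which forwards dense.cwiseProduct(sparse) to the sparse side; assuming the dense/sparse support backported in this branch, the result is a sparse expression.
#include <Eigen/Dense>
#include <Eigen/Sparse>
Eigen::MatrixXd d = Eigen::MatrixXd::Random(3, 3);
Eigen::SparseMatrix<double> s(3, 3);
s.insert(0, 0) = 2.0;
Eigen::SparseMatrix<double> r = d.cwiseProduct(s);  // only (0,0) can be nonzero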
///////// MatrixFunctions module /////////
typedef typename internal::stem_function<Scalar>::type StemFunction;
@@ -469,15 +461,49 @@ template<typename Derived> class MatrixBase
const MatrixSquareRootReturnValue<Derived> sqrt() const;
const MatrixLogarithmReturnValue<Derived> log() const;
const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;
#ifdef EIGEN2_SUPPORT
template<typename ProductDerived, typename Lhs, typename Rhs>
Derived& operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
EvalBeforeAssigningBit>& other);
template<typename ProductDerived, typename Lhs, typename Rhs>
Derived& operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
EvalBeforeAssigningBit>& other);
/** \deprecated because .lazy() is deprecated
* Overloaded for cache friendly product evaluation */
template<typename OtherDerived>
Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeAssigningBit>& other)
{ return lazyAssign(other._expression()); }
template<unsigned int Added>
const Flagged<Derived, Added, 0> marked() const;
const Flagged<Derived, 0, EvalBeforeAssigningBit> lazy() const;
inline const Cwise<Derived> cwise() const;
inline Cwise<Derived> cwise();
VectorBlock<Derived> start(Index size);
const VectorBlock<const Derived> start(Index size) const;
VectorBlock<Derived> end(Index size);
const VectorBlock<const Derived> end(Index size) const;
template<int Size> VectorBlock<Derived,Size> start();
template<int Size> const VectorBlock<const Derived,Size> start() const;
template<int Size> VectorBlock<Derived,Size> end();
template<int Size> const VectorBlock<const Derived,Size> end() const;
Minor<Derived> minor(Index row, Index col);
const Minor<Derived> minor(Index row, Index col) const;
#endif
protected:
EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
MatrixBase() : Base() {}
private:
EIGEN_DEVICE_FUNC explicit MatrixBase(int);
EIGEN_DEVICE_FUNC MatrixBase(int,int);
template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit MatrixBase(const MatrixBase<OtherDerived>&);
explicit MatrixBase(int);
MatrixBase(int,int);
template<typename OtherDerived> explicit MatrixBase(const MatrixBase<OtherDerived>&);
protected:
// mixing arrays and matrices is not legal
template<typename OtherDerived> Derived& operator+=(const ArrayBase<OtherDerived>& )


@@ -37,13 +37,11 @@ class NoAlias
/** Behaves like MatrixBase::lazyAssign(other)
* \sa MatrixBase::lazyAssign() */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
{ return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }
/** \sa MatrixBase::operator+= */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
{
typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
@@ -56,7 +54,6 @@ class NoAlias
/** \sa MatrixBase::operator-= */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
{
typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
@@ -69,12 +66,10 @@ class NoAlias
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
{ other.derived().addTo(m_expression); return m_expression; }
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
{ other.derived().subTo(m_expression); return m_expression; }
@@ -83,7 +78,6 @@ class NoAlias
{ return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
template<typename Lhs, typename Rhs, int NestingFlags>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
{ return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
@@ -92,7 +86,6 @@ class NoAlias
{ return m_expression = func; }
#endif
EIGEN_DEVICE_FUNC
ExpressionType& expression() const
{
return m_expression;
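// Usage sketch (assuming the usual dense typedefs): noalias() promises that the
// destination does not alias the right-hand side, so the assignment operators
// above can evaluate a product directly into it, skipping the usual temporary:
//   MatrixXd A(3,3), B(3,3), C(3,3);
//   A.setRandom(); B.setRandom();
//   C.noalias() = A * B;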


@@ -68,16 +68,7 @@ template<typename T> struct GenericNumTraits
>::type NonInteger;
typedef T Nested;
EIGEN_DEVICE_FUNC
static inline Real epsilon()
{
#if defined(__CUDA_ARCH__)
return internal::device::numeric_limits<T>::epsilon();
#else
return std::numeric_limits<T>::epsilon();
#endif
}
EIGEN_DEVICE_FUNC
static inline Real epsilon() { return std::numeric_limits<T>::epsilon(); }
static inline Real dummy_precision()
{
// make sure to override this for floating-point types
@@ -85,6 +76,13 @@ template<typename T> struct GenericNumTraits
}
static inline T highest() { return (std::numeric_limits<T>::max)(); }
static inline T lowest() { return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); }
#ifdef EIGEN2_SUPPORT
enum {
HasFloatingPoint = !IsInteger
};
typedef NonInteger FloatingPoint;
#endif
};
template<typename T> struct NumTraits : GenericNumTraits<T>
@@ -93,13 +91,11 @@ template<typename T> struct NumTraits : GenericNumTraits<T>
template<> struct NumTraits<float>
: GenericNumTraits<float>
{
EIGEN_DEVICE_FUNC
static inline float dummy_precision() { return 1e-5f; }
};
template<> struct NumTraits<double> : GenericNumTraits<double>
{
EIGEN_DEVICE_FUNC
static inline double dummy_precision() { return 1e-12; }
};
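// Usage sketch: dummy_precision() supplies the default tolerance of Eigen's fuzzy
// comparisons, e.g. for float matrices a and b (assumed usage):
//   bool close = a.isApprox(b);          // tolerance = NumTraits<float>::dummy_precision() = 1e-5f
//   bool tight = a.isApprox(b, 1e-7f);   // an explicit precision overrides the default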


@@ -66,11 +66,11 @@ class PermutationBase : public EigenBase<Derived>
MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
};
typedef typename Traits::StorageIndexType StorageIndexType;
typedef typename Traits::Scalar Scalar;
typedef typename Traits::Index Index;
typedef Matrix<StorageIndexType,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
DenseMatrixType;
typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndexType>
typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,Index>
PlainPermutationType;
using Base::derived;
#endif
@@ -147,7 +147,7 @@ class PermutationBase : public EigenBase<Derived>
/** Sets *this to be the identity permutation matrix */
void setIdentity()
{
for(StorageIndexType i = 0; i < size(); ++i)
for(Index i = 0; i < size(); ++i)
indices().coeffRef(i) = i;
}
@@ -173,8 +173,8 @@ class PermutationBase : public EigenBase<Derived>
eigen_assert(i>=0 && j>=0 && i<size() && j<size());
for(Index k = 0; k < size(); ++k)
{
if(indices().coeff(k) == i) indices().coeffRef(k) = StorageIndexType(j);
else if(indices().coeff(k) == j) indices().coeffRef(k) = StorageIndexType(i);
if(indices().coeff(k) == i) indices().coeffRef(k) = j;
else if(indices().coeff(k) == j) indices().coeffRef(k) = i;
}
return derived();
}
@@ -250,6 +250,35 @@ class PermutationBase : public EigenBase<Derived>
template<typename Other> friend
inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other, const PermutationBase& perm)
{ return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); }
/** \returns the determinant of the permutation matrix, which is either 1 or -1 depending on the parity of the permutation.
*
* This is an O(\c n) procedure that allocates a buffer of \c n booleans.
*/
Index determinant() const
{
Index res = 1;
Index n = size();
Matrix<bool,RowsAtCompileTime,1,0,MaxRowsAtCompileTime> mask(n);
mask.fill(false);
Index r = 0;
while(r < n)
{
// search for the next seed
while(r<n && mask[r]) r++;
if(r>=n)
break;
// we got one, let's follow it until we are back to the seed
Index k0 = r++;
mask.coeffRef(k0) = true;
for(Index k=indices().coeff(k0); k!=k0; k=indices().coeff(k))
{
mask.coeffRef(k) = true;
res = -res;
}
}
return res;
}
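// Behavior sketch (assumed usage): the cycle walk above flips `res` once per
// non-seed element of each cycle, so a cycle of length k contributes (-1)^(k-1):
//   PermutationMatrix<Dynamic,Dynamic> P(3);
//   P.setIdentity();                       // P.determinant() == 1
//   P.applyTranspositionOnTheRight(0,1);   // now P.determinant() == -1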
protected:
@@ -262,7 +291,7 @@ class PermutationBase : public EigenBase<Derived>
*
* \param SizeAtCompileTime the number of rows/cols, or Dynamic
* \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
* \param StorageIndexType the integer type of the indices
* \param IndexType the integer type of the indices
*
* This class represents a permutation matrix, internally stored as a vector of integers.
*
@@ -270,18 +299,17 @@ class PermutationBase : public EigenBase<Derived>
*/
namespace internal {
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType>
struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType> >
: traits<Matrix<_StorageIndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
: traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
{
typedef Matrix<_StorageIndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
typedef typename IndicesType::Index Index;
typedef _StorageIndexType StorageIndexType;
typedef IndexType Index;
typedef Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
};
}
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType>
class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType> >
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
{
typedef PermutationBase<PermutationMatrix> Base;
typedef internal::traits<PermutationMatrix> Traits;
@@ -289,8 +317,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
#ifndef EIGEN_PARSED_BY_DOXYGEN
typedef typename Traits::IndicesType IndicesType;
typedef typename Traits::StorageIndexType StorageIndexType;
typedef typename Traits::Index Index;
#endif
inline PermutationMatrix()
@@ -298,7 +324,7 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
/** Constructs an uninitialized permutation matrix of given size.
*/
inline PermutationMatrix(Index size) : m_indices(size)
inline PermutationMatrix(int size) : m_indices(size)
{}
/** Copy constructor. */
@@ -387,19 +413,18 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
namespace internal {
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType, int _PacketAccess>
struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType>,_PacketAccess> >
: traits<Matrix<_StorageIndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
: traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
{
typedef Map<const Matrix<_StorageIndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
typedef typename IndicesType::Index Index;
typedef _StorageIndexType StorageIndexType;
typedef IndexType Index;
typedef Map<const Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
};
}
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType, int _PacketAccess>
class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType>,_PacketAccess>
: public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType>,_PacketAccess> >
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess>
: public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
{
typedef PermutationBase<Map> Base;
typedef internal::traits<Map> Traits;
@@ -407,15 +432,14 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageInd
#ifndef EIGEN_PARSED_BY_DOXYGEN
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndexType;
typedef typename IndicesType::Index Index;
typedef typename IndicesType::Scalar Index;
#endif
inline Map(const StorageIndexType* indicesPtr)
inline Map(const Index* indicesPtr)
: m_indices(indicesPtr)
{}
inline Map(const StorageIndexType* indicesPtr, Index size)
inline Map(const Index* indicesPtr, Index size)
: m_indices(indicesPtr,size)
{}
@@ -471,8 +495,7 @@ struct traits<PermutationWrapper<_IndicesType> >
{
typedef PermutationStorage StorageKind;
typedef typename _IndicesType::Scalar Scalar;
typedef typename _IndicesType::Scalar StorageIndexType;
typedef typename _IndicesType::Index Index;
typedef typename _IndicesType::Scalar Index;
typedef _IndicesType IndicesType;
enum {
RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
@@ -561,7 +584,10 @@ struct permut_matrix_product_retval
const Index n = Side==OnTheLeft ? rows() : cols();
// FIXME we need an is_same for expression that is not sensitive to constness. For instance
// is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
if(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix))
if( is_same<MatrixTypeNestedCleaned,Dest>::value
&& blas_traits<MatrixTypeNestedCleaned>::HasUsableDirectAccess
&& blas_traits<Dest>::HasUsableDirectAccess
&& extract_data(dst) == extract_data(m_matrix))
{
// apply the permutation inplace
Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
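// Sketch of what the strengthened check above guards (assumed usage):
//   VectorXd v(3); v << 1, 2, 3;
//   v = P * v;   // dst aliases the source: the permutation is applied in place,
//                // following cycles with the boolean mask instead of copying
// Operands without usable direct access no longer take this in-place path.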


@@ -28,7 +28,6 @@ namespace internal {
template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
template<typename Index>
EIGEN_DEVICE_FUNC
static EIGEN_ALWAYS_INLINE void run(Index, Index)
{
}
@@ -36,7 +35,6 @@ template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
template<> struct check_rows_cols_for_overflow<Dynamic> {
template<typename Index>
EIGEN_DEVICE_FUNC
static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
{
// http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
@@ -131,17 +129,12 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
EIGEN_DEVICE_FUNC
Base& base() { return *static_cast<Base*>(this); }
EIGEN_DEVICE_FUNC
const Base& base() const { return *static_cast<const Base*>(this); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
{
if(Flags & RowMajorBit)
@@ -150,13 +143,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
return m_storage.data()[rowId + colId * m_storage.rows()];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
{
return m_storage.data()[index];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
{
if(Flags & RowMajorBit)
@@ -165,13 +156,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
return m_storage.data()[rowId + colId * m_storage.rows()];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
return m_storage.data()[index];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
{
if(Flags & RowMajorBit)
@@ -180,7 +169,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
return m_storage.data()[rowId + colId * m_storage.rows()];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
{
return m_storage.data()[index];
@@ -244,7 +232,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
*
* \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
{
eigen_assert( EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
@@ -275,7 +262,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
*
* \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
*/
EIGEN_DEVICE_FUNC
inline void resize(Index size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
@@ -300,7 +286,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
*
* \sa resize(Index,Index)
*/
EIGEN_DEVICE_FUNC
inline void resize(NoChange_t, Index nbCols)
{
resize(rows(), nbCols);
@@ -314,7 +299,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
*
* \sa resize(Index,Index)
*/
EIGEN_DEVICE_FUNC
inline void resize(Index nbRows, NoChange_t)
{
resize(nbRows, cols());
@@ -328,7 +312,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* remain row-vectors and vectors remain vectors.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
{
const OtherDerived& other = _other.derived();
@@ -356,7 +339,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* Matrices are resized relative to the top-left element. In case values need to be
* appended to the matrix they will be uninitialized.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
{
internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
@@ -369,7 +351,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
*
* In case the matrix is growing, new rows will be uninitialized.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
{
// Note: see the comment in conservativeResize(Index,Index)
@@ -383,7 +364,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
*
* In case the matrix is growing, new columns will be uninitialized.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
{
// Note: see the comment in conservativeResize(Index,Index)
@@ -398,7 +378,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
*
* When values are appended, they will be uninitialized.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void conservativeResize(Index size)
{
internal::conservative_resize_like_impl<Derived>::run(*this, size);
@@ -414,7 +393,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* appended to the matrix they will copied from \c other.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other)
{
internal::conservative_resize_like_impl<Derived,OtherDerived>::run(*this, other);
@@ -423,7 +401,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
/** This is a special case of the templated operator=. Its purpose is to
* prevent a default operator= from hiding the templated operator=.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other)
{
return _set(other);
@@ -431,7 +408,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
/** \sa MatrixBase::lazyAssign() */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other)
{
_resize_to_match(other);
@@ -439,14 +415,12 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func)
{
resize(func.rows(), func.cols());
return Base::operator=(func);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
{
// _check_template_params();
@@ -456,7 +430,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
#ifndef EIGEN_PARSED_BY_DOXYGEN
// FIXME is it still needed ?
/** \internal */
EIGEN_DEVICE_FUNC
PlainObjectBase(internal::constructor_without_unaligned_array_assert)
: m_storage(internal::constructor_without_unaligned_array_assert())
{
@@ -465,13 +438,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
#endif
#ifdef EIGEN_HAVE_RVALUE_REFERENCES
EIGEN_DEVICE_FUNC
PlainObjectBase(PlainObjectBase&& other)
: m_storage( std::move(other.m_storage) )
{
}
EIGEN_DEVICE_FUNC
PlainObjectBase& operator=(PlainObjectBase&& other)
{
using std::swap;
@@ -480,7 +451,22 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
}
#endif
EIGEN_DEVICE_FUNC
/** Copy constructor */
EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
: m_storage()
{
_check_template_params();
lazyAssign(other);
}
template<typename OtherDerived>
EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
: m_storage()
{
_check_template_params();
lazyAssign(other);
}
EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
: m_storage(a_size, nbRows, nbCols)
{
@@ -491,7 +477,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
/** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
{
_resize_to_match(other);
@@ -501,7 +486,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
: m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
{
@@ -584,16 +568,16 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
//@}
using Base::setConstant;
EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& value);
EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& value);
Derived& setConstant(Index size, const Scalar& value);
Derived& setConstant(Index rows, Index cols, const Scalar& value);
using Base::setZero;
EIGEN_DEVICE_FUNC Derived& setZero(Index size);
EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
Derived& setZero(Index size);
Derived& setZero(Index rows, Index cols);
using Base::setOnes;
EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
Derived& setOnes(Index size);
Derived& setOnes(Index rows, Index cols);
using Base::setRandom;
Derived& setRandom(Index size);
@@ -612,7 +596,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* remain row-vectors and vectors remain vectors.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
{
#ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -620,6 +603,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
: (rows() == other.rows() && cols() == other.cols())))
&& "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
EIGEN_ONLY_USED_FOR_DEBUG(other);
if(this->size()==0)
resizeLike(other);
#else
resizeLike(other);
#endif
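// Behavior sketch for the branch above: with EIGEN_NO_AUTOMATIC_RESIZING defined,
// only an empty destination may still be resized on assignment (assumed usage):
//   MatrixXd a;                // 0x0: the first assignment below resizes it
//   a = MatrixXd::Ones(2,2);   // ok
//   a = MatrixXd::Ones(3,3);   // would hit the size-mismatch assertion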
@@ -640,7 +625,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* \internal
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
{
_set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
@@ -648,11 +632,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }
/** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
@@ -661,7 +643,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* \sa operator=(const MatrixBase<OtherDerived>&), _set()
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
{
// I don't think we need this resize call since the lazyAssign will resize anyway
@@ -673,7 +654,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
}
template<typename T0, typename T1>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
{
EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
@@ -681,91 +661,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(nbRows,nbCols);
}
template<typename T0, typename T1>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
{
EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
m_storage.data()[0] = val0;
m_storage.data()[1] = val1;
}
template<typename T0, typename T1>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
typename internal::enable_if< (!internal::is_same<Index,Scalar>::value)
&& (internal::is_same<T0,Index>::value)
&& (internal::is_same<T1,Index>::value)
&& Base::SizeAtCompileTime==2,T1>::type* = 0)
{
EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
m_storage.data()[0] = Scalar(val0);
m_storage.data()[1] = Scalar(val1);
}
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value,T>::type* = 0)
{
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
const bool is_integer = NumTraits<T>::IsInteger;
EIGEN_STATIC_ASSERT(is_integer,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(size);
}
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
{
EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
m_storage.data()[0] = val0;
}
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Index& val0,
typename internal::enable_if< (!internal::is_same<Index,Scalar>::value)
&& (internal::is_same<Index,T>::value)
&& Base::SizeAtCompileTime==1
&& internal::is_convertible<T, Scalar>::value,T*>::type* = 0)
{
EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
m_storage.data()[0] = Scalar(val0);
}
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar* data){
this->_set_noalias(ConstMapType(data));
}
template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other){
this->_set_noalias(other);
}
template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other){
this->derived() = other;
}
template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other)
{
resize(other.rows(), other.cols());
other.evalTo(this->derived());
}
template<typename T, typename OtherDerived, int ColsAtCompileTime>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
{
this->derived() = r;
}
template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
friend struct internal::matrix_swap_impl;
@@ -774,7 +676,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* data pointers.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void _swap(DenseBase<OtherDerived> const & other)
{
enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
@@ -783,7 +684,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
public:
#ifndef EIGEN_PARSED_BY_DOXYGEN
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void _check_template_params()
{
EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
@@ -900,7 +800,6 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
struct matrix_swap_impl
{
EIGEN_DEVICE_FUNC
static inline void run(MatrixTypeA& a, MatrixTypeB& b)
{
a.base().swap(b);
@@ -910,7 +809,6 @@ struct matrix_swap_impl
template<typename MatrixTypeA, typename MatrixTypeB>
struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true>
{
EIGEN_DEVICE_FUNC
static inline void run(MatrixTypeA& a, MatrixTypeB& b)
{
static_cast<typename MatrixTypeA::Base&>(a).m_storage.swap(static_cast<typename MatrixTypeB::Base&>(b).m_storage);
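// Usage sketch: for two dynamic-size matrices of the same type, swap() resolves to
// the specialization above and exchanges the underlying storage without copying:
//   MatrixXd a(2,2), b(3,3);
//   a.swap(b);   // storage pointers swapped: a is now 3x3, b is 2x2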


@@ -1,107 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PRODUCT_H
#define EIGEN_PRODUCT_H
namespace Eigen {
template<typename Lhs, typename Rhs> class Product;
template<typename Lhs, typename Rhs, typename StorageKind> class ProductImpl;
/** \class Product
* \ingroup Core_Module
*
* \brief Expression of the product of two arbitrary matrices or vectors
*
* \param Lhs the type of the left-hand side expression
* \param Rhs the type of the right-hand side expression
*
* This class represents an expression of the product of two arbitrary matrices.
*
*/
// Use ProductReturnType to get correct traits, in particular vectorization flags
namespace internal {
template<typename Lhs, typename Rhs>
struct traits<Product<Lhs, Rhs> >
: traits<typename ProductReturnType<Lhs, Rhs>::Type>
{
// We want A+B*C to be of type Product<Matrix, Sum> and not Product<Matrix, Matrix>
// TODO: This flag should eventually go in a separate evaluator traits class
enum {
Flags = traits<typename ProductReturnType<Lhs, Rhs>::Type>::Flags & ~(EvalBeforeNestingBit | DirectAccessBit)
};
};
} // end namespace internal
template<typename Lhs, typename Rhs>
class Product : public ProductImpl<Lhs,Rhs,typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
typename internal::traits<Rhs>::StorageKind>::ret>
{
public:
typedef typename ProductImpl<
Lhs, Rhs,
typename internal::promote_storage_type<typename Lhs::StorageKind,
typename Rhs::StorageKind>::ret>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
typedef typename Lhs::Nested LhsNested;
typedef typename Rhs::Nested RhsNested;
typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
{
eigen_assert(lhs.cols() == rhs.rows()
&& "invalid matrix product"
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
inline Index rows() const { return m_lhs.rows(); }
inline Index cols() const { return m_rhs.cols(); }
const LhsNestedCleaned& lhs() const { return m_lhs; }
const RhsNestedCleaned& rhs() const { return m_rhs; }
protected:
LhsNested m_lhs;
RhsNested m_rhs;
};
template<typename Lhs, typename Rhs>
class ProductImpl<Lhs,Rhs,Dense> : public internal::dense_xpr_base<Product<Lhs,Rhs> >::type
{
typedef Product<Lhs, Rhs> Derived;
public:
typedef typename internal::dense_xpr_base<Product<Lhs, Rhs> >::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
};
/***************************************************************************
* Implementation of matrix base methods
***************************************************************************/
/** \internal used to test the evaluator only
*/
template<typename Lhs,typename Rhs>
const Product<Lhs,Rhs>
prod(const Lhs& lhs, const Rhs& rhs)
{
return Product<Lhs,Rhs>(lhs,rhs);
}
} // end namespace Eigen
#endif // EIGEN_PRODUCT_H


@@ -85,7 +85,14 @@ class ProductBase : public MatrixBase<Derived>
public:
#ifndef EIGEN_NO_MALLOC
typedef typename Base::PlainObject BasePlainObject;
typedef Matrix<Scalar,RowsAtCompileTime==1?1:Dynamic,ColsAtCompileTime==1?1:Dynamic,BasePlainObject::Options> DynPlainObject;
typedef typename internal::conditional<(BasePlainObject::SizeAtCompileTime==Dynamic) || (BasePlainObject::SizeAtCompileTime*int(sizeof(Scalar)) < int(EIGEN_STACK_ALLOCATION_LIMIT)),
BasePlainObject, DynPlainObject>::type PlainObject;
#else
typedef typename Base::PlainObject PlainObject;
#endif
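// Sizing sketch for the typedefs above (numbers assumed): a fixed-size product
// result such as Matrix<double,256,256> would occupy 256*256*8 = 524288 bytes;
// once that exceeds EIGEN_STACK_ALLOCATION_LIMIT, PlainObject resolves to the
// heap-backed DynPlainObject rather than a stack-allocated temporary.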
ProductBase(const Lhs& a_lhs, const Rhs& a_rhs)
: m_lhs(a_lhs), m_rhs(a_rhs)
@@ -131,13 +138,17 @@ class ProductBase : public MatrixBase<Derived>
const Diagonal<FullyLazyCoeffBaseProductType,Dynamic> diagonal(Index index) const
{ return FullyLazyCoeffBaseProductType(m_lhs, m_rhs).diagonal(index); }
// restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isn't an Lvalue expression
typename Base::CoeffReturnType coeff(Index row, Index col) const
{
#ifdef EIGEN2_SUPPORT
return lhs().row(row).cwiseProduct(rhs().col(col).transpose()).sum();
#else
EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
eigen_assert(this->rows() == 1 && this->cols() == 1);
Matrix<Scalar,1,1> result = *this;
return result.coeff(row,col);
#endif
}
typename Base::CoeffReturnType coeff(Index i) const
@@ -176,7 +187,12 @@ namespace internal {
template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
struct nested<GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
{
typedef PlainObject const& type;
typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
};
template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
struct nested<const GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
{
typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
};
}
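// Usage sketch for the 1x1 restriction above: an inner product written as a matrix
// product is statically 1x1, so coeff() is allowed and evaluates the product into a
// Matrix<Scalar,1,1> temporary (assumed usage):
//   VectorXd u(3), v(3); u.setOnes(); v.setOnes();
//   double d = (u.transpose() * v).coeff(0,0);   // d == 3.0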


@@ -1,411 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PRODUCTEVALUATORS_H
#define EIGEN_PRODUCTEVALUATORS_H
namespace Eigen {
namespace internal {
// We can evaluate the product either all at once, like GeneralProduct and its evalTo() function, or
// traverse the matrix coefficient by coefficient, like CoeffBasedProduct. Use the existing logic
// in ProductReturnType to decide.
template<typename XprType, typename ProductType>
struct product_evaluator_dispatcher;
template<typename Lhs, typename Rhs>
struct evaluator_impl<Product<Lhs, Rhs> >
: product_evaluator_dispatcher<Product<Lhs, Rhs>, typename ProductReturnType<Lhs, Rhs>::Type>
{
typedef Product<Lhs, Rhs> XprType;
typedef product_evaluator_dispatcher<XprType, typename ProductReturnType<Lhs, Rhs>::Type> Base;
evaluator_impl(const XprType& xpr) : Base(xpr)
{ }
};
template<typename XprType, typename ProductType>
struct product_evaluator_traits_dispatcher;
template<typename Lhs, typename Rhs>
struct evaluator_traits<Product<Lhs, Rhs> >
: product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, typename ProductReturnType<Lhs, Rhs>::Type>
{
static const int AssumeAliasing = 1;
};
// Case 1: Evaluate all at once
//
// We can view the GeneralProduct class as a part of the product evaluator.
// Four sub-cases: InnerProduct, OuterProduct, GemmProduct and GemvProduct.
// InnerProduct is special because GeneralProduct does not have an evalTo() method in this case.
template<typename Lhs, typename Rhs>
struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, InnerProduct> >
{
static const int HasEvalTo = 0;
};
template<typename Lhs, typename Rhs>
struct product_evaluator_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, InnerProduct> >
: public evaluator<typename Product<Lhs, Rhs>::PlainObject>::type
{
typedef Product<Lhs, Rhs> XprType;
typedef typename XprType::PlainObject PlainObject;
typedef typename evaluator<PlainObject>::type evaluator_base;
// TODO: Computation is too early (?)
product_evaluator_dispatcher(const XprType& xpr) : evaluator_base(m_result)
{
m_result.coeffRef(0,0) = (xpr.lhs().transpose().cwiseProduct(xpr.rhs())).sum();
}
protected:
PlainObject m_result;
};
// For the other three subcases, simply call the evalTo() method of GeneralProduct
// TODO: GeneralProduct should take evaluators, not expression objects.
template<typename Lhs, typename Rhs, int ProductType>
struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, ProductType> >
{
static const int HasEvalTo = 1;
};
template<typename Lhs, typename Rhs, int ProductType>
struct product_evaluator_dispatcher<Product<Lhs, Rhs>, GeneralProduct<Lhs, Rhs, ProductType> >
{
typedef Product<Lhs, Rhs> XprType;
typedef typename XprType::PlainObject PlainObject;
typedef typename evaluator<PlainObject>::type evaluator_base;
product_evaluator_dispatcher(const XprType& xpr) : m_xpr(xpr)
{ }
template<typename DstEvaluatorType, typename DstXprType>
void evalTo(DstEvaluatorType /* not used */, DstXprType& dst) const
{
dst.resize(m_xpr.rows(), m_xpr.cols());
GeneralProduct<Lhs, Rhs, ProductType>(m_xpr.lhs(), m_xpr.rhs()).evalTo(dst);
}
protected:
const XprType& m_xpr;
};
// Case 2: Evaluate coeff by coeff
//
// This is mostly taken from CoeffBasedProduct.h
// The main difference is that we add an extra argument to the etor_product_*_impl::run() function
// for the inner dimension of the product, because evaluator objects do not know their size.
template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct etor_product_coeff_impl;
template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl;
template<typename Lhs, typename Rhs, typename LhsNested, typename RhsNested, int Flags>
struct product_evaluator_traits_dispatcher<Product<Lhs, Rhs>, CoeffBasedProduct<LhsNested, RhsNested, Flags> >
{
static const int HasEvalTo = 0;
};
template<typename Lhs, typename Rhs, typename LhsNested, typename RhsNested, int Flags>
struct product_evaluator_dispatcher<Product<Lhs, Rhs>, CoeffBasedProduct<LhsNested, RhsNested, Flags> >
: evaluator_impl_base<Product<Lhs, Rhs> >
{
typedef Product<Lhs, Rhs> XprType;
typedef CoeffBasedProduct<LhsNested, RhsNested, Flags> CoeffBasedProductType;
product_evaluator_dispatcher(const XprType& xpr)
: m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs()),
m_innerDim(xpr.lhs().cols())
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
typedef typename XprType::PacketReturnType PacketReturnType;
// Everything below here is taken from CoeffBasedProduct.h
enum {
RowsAtCompileTime = traits<CoeffBasedProductType>::RowsAtCompileTime,
PacketSize = packet_traits<Scalar>::size,
InnerSize = traits<CoeffBasedProductType>::InnerSize,
CoeffReadCost = traits<CoeffBasedProductType>::CoeffReadCost,
Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
CanVectorizeInner = traits<CoeffBasedProductType>::CanVectorizeInner
};
typedef typename evaluator<Lhs>::type LhsEtorType;
typedef typename evaluator<Rhs>::type RhsEtorType;
typedef etor_product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
Unroll ? InnerSize-1 : Dynamic,
LhsEtorType, RhsEtorType, Scalar> CoeffImpl;
const CoeffReturnType coeff(Index row, Index col) const
{
Scalar res;
CoeffImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
return res;
}
/* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
* which is why we don't set the LinearAccessBit.
*/
const CoeffReturnType coeff(Index index) const
{
Scalar res;
const Index row = RowsAtCompileTime == 1 ? 0 : index;
const Index col = RowsAtCompileTime == 1 ? index : 0;
CoeffImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
return res;
}
template<int LoadMode>
const PacketReturnType packet(Index row, Index col) const
{
PacketScalar res;
typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
Unroll ? InnerSize-1 : Dynamic,
LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl;
PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
return res;
}
protected:
typename evaluator<Lhs>::type m_lhsImpl;
typename evaluator<Rhs>::type m_rhsImpl;
// TODO: Get rid of m_innerDim if known at compile time
Index m_innerDim;
};
/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/
/**************************************
*** Scalar path - no vectorization ***
**************************************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct etor_product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar &res)
{
etor_product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, innerDim, res);
res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct etor_product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, RetScalar &res)
{
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct etor_product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar& res)
{
eigen_assert(innerDim>0 && "you are using an uninitialized matrix");
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
for(Index i = 1; i < innerDim; ++i)
res += lhs.coeff(row, i) * rhs.coeff(i, col);
}
};
/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
struct etor_product_coeff_vectorized_unroller
{
typedef typename Lhs::Index Index;
enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, typename Lhs::PacketScalar &pres)
{
etor_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, innerDim, pres);
pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
}
};
template<typename Lhs, typename Rhs, typename Packet>
struct etor_product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::PacketScalar &pres)
{
pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
}
};
template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct etor_product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::PacketScalar Packet;
typedef typename Lhs::Index Index;
enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, RetScalar &res)
{
Packet pres;
etor_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, innerDim, pres);
etor_product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, innerDim, res);
res = predux(pres);
}
};
template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
struct etor_product_coeff_vectorized_dyn_selector
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
{
res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
}
};
// NOTE the following 3 specializations exist because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template<typename Lhs, typename Rhs, int RhsCols>
struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
{
res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
}
};
template<typename Lhs, typename Rhs, int LhsRows>
struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
{
res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
}
};
template<typename Lhs, typename Rhs>
struct etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{
typedef typename Lhs::Index Index;
EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, typename Lhs::Scalar &res)
{
res = lhs.transpose().cwiseProduct(rhs).sum();
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct etor_product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, typename Lhs::Scalar &res)
{
etor_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, innerDim, res);
}
};
/*******************
*** Packet path ***
*******************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
}
};
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
eigen_assert(innerDim>0 && "you are using an uninitialized matrix");
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(Index i = 1; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
eigen_assert(innerDim>0 && "you are using an uninitialized matrix");
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
for(Index i = 1; i < innerDim; ++i)
res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PRODUCT_EVALUATORS_H
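// Usage sketch: this removed evaluator path was exercised through the likewise
// removed prod() helper, e.g. (assumed test usage)
//   MatrixXd A(2,3), B(3,4); A.setRandom(); B.setRandom();
//   MatrixXd C = prod(A, B);   // Product<> expression evaluated by the dispatchers above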


@@ -28,18 +28,12 @@ struct functor_traits<scalar_random_op<Scalar> >
/** \returns a random matrix expression
*
* Numbers are uniformly spread through their whole definition range for integer types,
* and in the [-1:1] range for floating point scalar types.
*
* The parameters \a rows and \a cols are the number of rows and of columns of
* the returned matrix. Must be compatible with this MatrixBase type.
*
* \not_reentrant
*
* This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
* it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
* instead.
*
*
* Example: \include MatrixBase_random_int_int.cpp
* Output: \verbinclude MatrixBase_random_int_int.out
@@ -47,10 +41,8 @@ struct functor_traits<scalar_random_op<Scalar> >
* This expression has the "evaluate before nesting" flag so that it will be evaluated into
* a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
* behavior with expressions involving random matrices.
*
* See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
*
* \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
* \sa MatrixBase::setRandom(), MatrixBase::Random(Index), MatrixBase::Random()
*/
template<typename Derived>
inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -60,15 +52,11 @@ DenseBase<Derived>::Random(Index rows, Index cols)
}
/** \returns a random vector expression
*
* Numbers are uniformly spread through their whole definition range for integer types,
* and in the [-1:1] range for floating point scalar types.
*
* The parameter \a size is the size of the returned vector.
* Must be compatible with this MatrixBase type.
*
* \only_for_vectors
* \not_reentrant
*
* This variant is meant to be used for dynamic-size vector types. For fixed-size types,
* it is redundant to pass \a size as argument, so Random() should be used
@@ -81,7 +69,7 @@ DenseBase<Derived>::Random(Index rows, Index cols)
* a temporary vector whenever it is nested in a larger expression. This prevents unexpected
* behavior with expressions involving random matrices.
*
* \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
* \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random()
*/
template<typename Derived>
inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -92,9 +80,6 @@ DenseBase<Derived>::Random(Index size)
/** \returns a fixed-size random matrix or vector expression
*
* Numbers are uniformly spread through their whole definition range for integer types,
* and in the [-1:1] range for floating point scalar types.
*
* This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
* need to use the variants taking size arguments.
*
@@ -104,10 +89,8 @@ DenseBase<Derived>::Random(Index size)
* This expression has the "evaluate before nesting" flag so that it will be evaluated into
* a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
* behavior with expressions involving random matrices.
*
* \not_reentrant
*
* \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
* \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random(Index)
*/
template<typename Derived>
inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
@@ -118,11 +101,6 @@ DenseBase<Derived>::Random()
/** Sets all coefficients in this expression to random values.
*
* Numbers are uniformly spread through their whole definition range for integer types,
* and in the [-1:1] range for floating point scalar types.
*
* \not_reentrant
*
* Example: \include MatrixBase_setRandom.cpp
* Output: \verbinclude MatrixBase_setRandom.out
*
@@ -136,16 +114,12 @@ inline Derived& DenseBase<Derived>::setRandom()
/** Resizes to the given \a newSize, and sets all coefficients in this expression to random values.
*
* Numbers are uniformly spread through their whole definition range for integer types,
* and in the [-1:1] range for floating point scalar types.
*
* \only_for_vectors
* \not_reentrant
*
* Example: \include Matrix_setRandom_int.cpp
* Output: \verbinclude Matrix_setRandom_int.out
*
* \sa DenseBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, DenseBase::Random()
* \sa MatrixBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, MatrixBase::Random()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
@@ -157,18 +131,13 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
/** Resizes to the given size, and sets all coefficients in this expression to random values.
*
* Numbers are uniformly spread through their whole definition range for integer types,
* and in the [-1:1] range for floating point scalar types.
*
* \not_reentrant
*
* \param nbRows the new number of rows
* \param nbCols the new number of columns
*
* Example: \include Matrix_setRandom_int_int.cpp
* Output: \verbinclude Matrix_setRandom_int_int.out
*
* \sa DenseBase::setRandom(), setRandom(Index), class CwiseNullaryOp, DenseBase::Random()
* \sa MatrixBase::setRandom(), setRandom(Index), class CwiseNullaryOp, MatrixBase::Random()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&


@@ -82,7 +82,6 @@ struct redux_novec_unroller
typedef typename Derived::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
{
return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
@@ -100,7 +99,6 @@ struct redux_novec_unroller<Func, Derived, Start, 1>
typedef typename Derived::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
{
return mat.coeffByOuterInner(outer, inner);
@@ -114,7 +112,6 @@ template<typename Func, typename Derived, int Start>
struct redux_novec_unroller<Func, Derived, Start, 0>
{
typedef typename Derived::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
};
@@ -173,7 +170,6 @@ struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
{
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
@@ -207,7 +203,7 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
const Index packetSize = packet_traits<Scalar>::size;
const Index alignedStart = internal::first_aligned(mat);
enum {
alignment = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) || bool(Derived::Flags & AlignedBit)
alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
? Aligned : Unaligned
};
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
@@ -251,8 +247,9 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
}
};
template<typename Func, typename Derived>
struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
template<typename Func, typename Derived, int Unrolling>
struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
{
typedef typename Derived::Scalar Scalar;
typedef typename packet_traits<Scalar>::type PacketScalar;
@@ -303,15 +300,10 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
if (VectorizedSize > 0) {
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
if (VectorizedSize != Size)
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
return res;
}
else {
return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
}
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
if (VectorizedSize != Size)
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
return res;
}
};

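The hunks above rework the reduction kernels: the slice-vectorized specialization now bypasses unrolling for any requested Unrolling mode, and the fully unrolled vectorized path drops the VectorizedSize > 0 guard. Which kernel actually runs is an internal decision driven by expression flags, sizes and packet width; the pairing below is therefore only a plausible illustration, not a guarantee:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXf m = Eigen::MatrixXf::Random(7, 5);
  // An inner block is generally not linearly addressable, which steers
  // sum() toward the slice-vectorized reduction kernel patched above.
  float s1 = m.block(1, 1, 4, 3).sum();
  // A small fixed-size matrix can take the completely unrolled kernel.
  Eigen::Matrix3f f = Eigen::Matrix3f::Random();
  float s2 = f.sum();
  std::cout << s1 << " " << s2 << std::endl;
}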
View File

@@ -19,17 +19,17 @@ template<typename PlainObjectType, int Options = 0,
/** \class Ref
* \ingroup Core_Module
*
* \brief A matrix or vector expression mapping an existing expression
* \brief A matrix or vector expression mapping an existing expressions
*
* \tparam PlainObjectType the equivalent matrix type of the mapped data
* \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
* The default is \c #Unaligned.
* \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
* but accepts a variable outer stride (leading dimension).
* but accept a variable outer stride (leading dimension).
* This can be overridden by specifying strides.
* The type passed here must be a specialization of the Stride template, see examples below.
*
* This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
* This class permits to write non template functions taking Eigen's object as parameters while limiting the number of copies.
* A Ref<> object can represent either a const expression or a l-value:
* \code
* // in-out argument:
@@ -39,10 +39,10 @@ template<typename PlainObjectType, int Options = 0,
* void foo2(const Ref<const VectorXf>& x);
* \endcode
*
* In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
* In the in-out case, the input argument must satisfies the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
* By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
* Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
* the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
* Likewise, a Ref<MatrixXf> can reference any column major dense matrix expression of float whose column's elements are contiguously stored with
* the possibility to have a constant space inbetween each column, i.e.: the inner stride mmust be equal to 1, but the outer-stride (or leading dimension),
* can be greater than the number of rows.
*
* In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
@@ -58,15 +58,15 @@ template<typename PlainObjectType, int Options = 0,
* foo2(A.col().segment(2,4)); // No temporary
* \endcode
*
* The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
* The range of inputs that can be referenced without temporary can be enlarged using the last two template parameter.
* Here is an example accepting an inner stride != 1:
* \code
* // in-out argument:
* void foo3(Ref<VectorXf,0,InnerStride<> > x);
* foo3(A.row()); // OK
* \endcode
* The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
* expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a
* The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involved more
* expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overloads internally calling a
* template function, e.g.:
* \code
* // in the .h:
@@ -108,7 +108,8 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
OuterStrideMatch = Derived::IsVectorAtCompileTime
|| int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch
ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
};
typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
};
@@ -187,7 +188,11 @@ protected:
template<typename PlainObjectType, int Options, typename StrideType> class Ref
: public RefBase<Ref<PlainObjectType, Options, StrideType> >
{
private:
typedef internal::traits<Ref> Traits;
template<typename Derived>
inline Ref(const PlainObjectBase<Derived>& expr,
typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
public:
typedef RefBase<Ref> Base;
@@ -199,17 +204,20 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
inline Ref(PlainObjectBase<Derived>& expr,
typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
{
Base::construct(expr);
EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
Base::construct(expr.derived());
}
template<typename Derived>
inline Ref(const DenseBase<Derived>& expr,
typename internal::enable_if<bool(internal::is_lvalue<Derived>::value&&bool(Traits::template match<Derived>::MatchAtCompileTime)),Derived>::type* = 0,
int = Derived::ThisConstantIsPrivateInPlainObjectBase)
typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
#else
template<typename Derived>
inline Ref(DenseBase<Derived>& expr)
#endif
{
EIGEN_STATIC_ASSERT(static_cast<bool>(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
enum { THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY = Derived::ThisConstantIsPrivateInPlainObjectBase};
Base::construct(expr.const_cast_derived());
}
@@ -228,13 +236,23 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
template<typename Derived>
inline Ref(const DenseBase<Derived>& expr)
inline Ref(const DenseBase<Derived>& expr,
typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
{
// std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
// std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
// std::cout << int(StrideType::InnerStrideAtCompileTime) << " - " << int(Derived::InnerStrideAtCompileTime) << "\n";
construct(expr.derived(), typename Traits::template match<Derived>::type());
}
inline Ref(const Ref& other) : Base(other) {
// copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
}
template<typename OtherRef>
inline Ref(const RefBase<OtherRef>& other) {
construct(other.derived(), typename Traits::template match<OtherRef>::type());
}
protected:

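The documentation above sketches a public entry point forwarding to an internal template so the work is written only once; a hypothetical completion of that sketch (norm1 and norm1_impl are illustrative names, not Eigen API):

#include <Eigen/Dense>
#include <iostream>

// the actual work, written once as an internal template
template<typename Derived>
float norm1_impl(const Eigen::MatrixBase<Derived>& x)
{ return x.cwiseAbs().sum(); }

// public non-template API: accepts any float vector, arbitrary inner stride
float norm1(const Eigen::Ref<const Eigen::VectorXf, 0, Eigen::InnerStride<> >& x)
{ return norm1_impl(x); }

int main()
{
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(4, 4);
  std::cout << norm1(A.col(1)) << "\n";        // contiguous: no temporary
  std::cout << norm1(A.row(2)) << "\n";        // inner stride != 1: no temporary
  std::cout << norm1(2.f * A.col(0)) << "\n";  // expression: evaluated into a temporary
}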
View File

@@ -135,7 +135,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
*/
template<typename Derived>
template<int RowFactor, int ColFactor>
inline const Replicate<Derived,RowFactor,ColFactor>
const Replicate<Derived,RowFactor,ColFactor>
DenseBase<Derived>::replicate() const
{
return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -150,7 +150,7 @@ DenseBase<Derived>::replicate() const
* \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
*/
template<typename Derived>
inline const Replicate<Derived,Dynamic,Dynamic>
const typename DenseBase<Derived>::ReplicateReturnType
DenseBase<Derived>::replicate(Index rowFactor,Index colFactor) const
{
return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);

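For reference, the behaviour of the two replicate() overloads touched above (values chosen for illustration):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Vector3i v(1, 2, 3);
  // compile-time factors: returns Replicate<Derived,1,3>, a 3x3 expression
  std::cout << v.replicate<1, 3>() << "\n\n";
  // run-time factors: returns the ReplicateReturnType alias used above (6x2 here)
  std::cout << v.replicate(2, 2) << std::endl;
}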
View File

@@ -57,11 +57,10 @@ template<typename Derived> class ReturnByValue
EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)
template<typename Dest>
EIGEN_DEVICE_FUNC
inline void evalTo(Dest& dst) const
{ static_cast<const Derived*>(this)->evalTo(dst); }
EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
#define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
@@ -73,6 +72,8 @@ template<typename Derived> class ReturnByValue
const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
template<int LoadMode> Unusable& packet(Index) const;
template<int LoadMode> Unusable& packet(Index, Index) const;
#endif
};
@@ -84,6 +85,15 @@ Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
return derived();
}
template<typename Derived>
template<typename OtherDerived>
Derived& DenseBase<Derived>::lazyAssign(const ReturnByValue<OtherDerived>& other)
{
other.evalTo(derived());
return derived();
}
} // end namespace Eigen
#endif // EIGEN_RETURNBYVALUE_H

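A ReturnByValue expression can only be consumed by assignment: the poisoned coeff()/packet() members above turn any per-coefficient access into a compile error, and operator=/lazyAssign simply forward to evalTo(). A quick sketch, assuming inverse() is implemented on top of ReturnByValue in this generation of Eigen:

#include <Eigen/Dense>

int main()
{
  Eigen::Matrix3d A = Eigen::Matrix3d::Random();
  Eigen::Matrix3d B;
  B = A.inverse();          // OK: assignment calls evalTo(B) once
  // double d = A.inverse().coeff(0,0); // would not compile: coeff() is Unusable
  double d = B.coeff(0, 0); // evaluate first, then access coefficients
  (void)d;
  return 0;
}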
View File

@@ -69,23 +69,17 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
};
typedef typename MatrixType::PlainObject PlainObject;
EIGEN_DEVICE_FUNC
inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return m_matrix.outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return m_matrix.innerStride(); }
/** \sa MatrixBase::coeff()
* \warning the coordinates must fit into the referenced triangular part
*/
EIGEN_DEVICE_FUNC
inline Scalar coeff(Index row, Index col) const
{
Base::check_coordinates_internal(row, col);
@@ -95,7 +89,6 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
/** \sa MatrixBase::coeffRef()
* \warning the coordinates must fit into the referenced triangular part
*/
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col)
{
Base::check_coordinates_internal(row, col);
@@ -103,17 +96,13 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
}
/** \internal */
EIGEN_DEVICE_FUNC
const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }
EIGEN_DEVICE_FUNC
const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
EIGEN_DEVICE_FUNC
MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
/** Efficient self-adjoint matrix times vector/matrix product */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
operator*(const MatrixBase<OtherDerived>& rhs) const
{
@@ -124,7 +113,6 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
/** Efficient vector/matrix times self-adjoint matrix product */
template<typename OtherDerived> friend
EIGEN_DEVICE_FUNC
SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
{
@@ -144,7 +132,6 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
* \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
*/
template<typename DerivedU, typename DerivedV>
EIGEN_DEVICE_FUNC
SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));
/** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
@@ -158,7 +145,6 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
* \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
*/
template<typename DerivedU>
EIGEN_DEVICE_FUNC
SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
/////////// Cholesky module ///////////
@@ -173,10 +159,31 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
/** Return type of eigenvalues() */
typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;
EIGEN_DEVICE_FUNC
EigenvaluesReturnType eigenvalues() const;
EIGEN_DEVICE_FUNC
RealScalar operatorNorm() const;
#ifdef EIGEN2_SUPPORT
template<typename OtherDerived>
SelfAdjointView& operator=(const MatrixBase<OtherDerived>& other)
{
enum {
OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
};
m_matrix.const_cast_derived().template triangularView<UpLo>() = other;
m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.adjoint();
return *this;
}
template<typename OtherMatrixType, unsigned int OtherMode>
SelfAdjointView& operator=(const TriangularView<OtherMatrixType, OtherMode>& other)
{
enum {
OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
};
m_matrix.const_cast_derived().template triangularView<UpLo>() = other.toDenseMatrix();
m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.toDenseMatrix().adjoint();
return *this;
}
#endif
protected:
MatrixTypeNested m_matrix;
@@ -202,7 +209,6 @@ struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), U
row = (UnrollCount-1) % Derived1::RowsAtCompileTime
};
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
@@ -217,7 +223,6 @@ struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), U
template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &, const Derived2 &) {}
};
@@ -229,7 +234,6 @@ struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), U
row = (UnrollCount-1) % Derived1::RowsAtCompileTime
};
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
@@ -244,7 +248,6 @@ struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), U
template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &, const Derived2 &) {}
};
@@ -252,7 +255,6 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
for(Index j = 0; j < dst.cols(); ++j)
@@ -270,7 +272,6 @@ struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dyn
template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
typedef typename Derived1::Index Index;

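A short sketch of the rankUpdate() members declared above (the EIGEN_DEVICE_FUNC removals do not change their semantics):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Zero(3, 3);
  Eigen::VectorXd u = Eigen::VectorXd::Random(3);
  Eigen::VectorXd v = Eigen::VectorXd::Random(3);
  // rank-1 update: m += 2 * u * u^T, touching only the Lower triangle
  m.selfadjointView<Eigen::Lower>().rankUpdate(u, 2.0);
  // rank-2 update: m += u*v^T + v*u^T (alpha defaults to 1)
  m.selfadjointView<Eigen::Lower>().rankUpdate(u, v);
  // materialize the full symmetric matrix from the stored triangle
  std::cout << m.selfadjointView<Eigen::Lower>().toDenseMatrix() << std::endl;
}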
View File

@@ -35,7 +35,7 @@ struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
enum {
// Note that it is still a good idea to preserve the DirectAccessBit
// so that assign can correctly align the data.
Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&AlignedBit) | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
};
@@ -52,24 +52,21 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
typedef typename internal::packet_traits<Scalar>::type Packet;
EIGEN_DEVICE_FUNC
inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_matrix.outerStride(); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_matrix.innerStride(); }
EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_matrix.data(); }
inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }
inline Index outerStride() const { return m_matrix.outerStride(); }
inline Index innerStride() const { return m_matrix.innerStride(); }
inline const Scalar* data() const { return m_matrix.data(); }
// note that this function is needed by assign to correctly align loads/stores
// TODO make Assign use .data()
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col)
{
EIGEN_STATIC_ASSERT_LVALUE(Lhs)
return m_matrix.const_cast_derived().coeffRef(row, col);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index row, Index col) const
{
return m_matrix.coeffRef(row, col);
@@ -77,20 +74,17 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
// note that this function is needed by assign to correctly align loads/stores
// TODO make Assign use .data()
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
EIGEN_STATIC_ASSERT_LVALUE(Lhs)
return m_matrix.const_cast_derived().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return m_matrix.const_cast_derived().coeffRef(index);
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
@@ -101,7 +95,6 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
@@ -132,7 +125,6 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
// reimplement lazyAssign to handle complex *= real
// see CwiseBinaryOp ctor for details
template<typename RhsDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
{
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
@@ -152,20 +144,17 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
// overloaded to honor evaluation of special matrices
// maybe another solution would be to not use SelfCwiseBinaryOp
// at first...
EIGEN_DEVICE_FUNC
SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
{
typename internal::nested<Rhs>::type rhs(_rhs);
return Base::operator=(rhs);
}
EIGEN_DEVICE_FUNC
Lhs& expression() const
{
return m_matrix;
}
EIGEN_DEVICE_FUNC
const BinaryOp& functor() const
{
return m_functor;
@@ -188,24 +177,6 @@ inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
return derived();
}
template<typename Derived>
inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
tmp = PlainObject::Constant(rows(),cols(),other);
return derived();
}
template<typename Derived>
inline Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
tmp = PlainObject::Constant(rows(),cols(),other);
return derived();
}
template<typename Derived>
inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{

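The compound scalar operators that remain in this file route through SelfCwiseBinaryOp so coefficients are updated in place; operator/= now performs a true coefficient-wise division rather than multiplying by the reciprocal. For reference:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::VectorXf v = Eigen::VectorXf::Constant(4, 1.0f);
  v *= 2.0f;  // in-place scaling via SelfCwiseBinaryOp
  v /= 3.0f;  // a true division by 3, not v *= (1/3)
  Eigen::ArrayXf a = Eigen::ArrayXf::Zero(4);
  a += 1.5f;  // scalar += on arrays is unaffected by the refactoring above
  std::cout << v.transpose() << "\n" << a.transpose() << std::endl;
}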
View File

@@ -20,7 +20,7 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
using std::max;
Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
if(maxCoeff>scale)
if (maxCoeff>scale)
{
ssq = ssq * numext::abs2(scale/maxCoeff);
Scalar tmp = Scalar(1)/maxCoeff;
@@ -29,21 +29,12 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
invScale = NumTraits<Scalar>::highest();
scale = Scalar(1)/invScale;
}
else if(maxCoeff>NumTraits<Scalar>::highest()) // we got a INF
{
invScale = Scalar(1);
scale = maxCoeff;
}
else
{
scale = maxCoeff;
invScale = tmp;
}
}
else if(maxCoeff!=maxCoeff) // we got a NaN
{
scale = maxCoeff;
}
// TODO if the maxCoeff is much much smaller than the current scale,
// then we can neglect this sub vector
@@ -58,13 +49,13 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
typedef typename Derived::RealScalar RealScalar;
typedef typename Derived::Index Index;
using std::pow;
EIGEN_USING_STD_MATH(min);
EIGEN_USING_STD_MATH(max);
using std::min;
using std::max;
using std::sqrt;
using std::abs;
const Derived& vec(_vec.derived());
static bool initialized = false;
static RealScalar b1, b2, s1m, s2m, rbig, relerr;
static RealScalar b1, b2, s1m, s2m, overfl, rbig, relerr;
if(!initialized)
{
int ibeta, it, iemin, iemax, iexp;
@@ -93,6 +84,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
iexp = - ((iemax+it)/2);
s2m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for upper range
overfl = rbig*s2m; // overflow boundary for abig
eps = RealScalar(pow(double(ibeta), 1-it));
relerr = sqrt(eps); // tolerance for neglecting asml
initialized = true;
@@ -109,13 +101,13 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
else if(ax < b1) asml += numext::abs2(ax*s1m);
else amed += numext::abs2(ax);
}
if(amed!=amed)
return amed; // we got a NaN
if(abig > RealScalar(0))
{
abig = sqrt(abig);
if(abig > rbig) // overflow, or *this contains INF values
return abig; // return INF
if(abig > overfl)
{
return rbig;
}
if(amed > RealScalar(0))
{
abig = abig/s2m;
@@ -160,7 +152,7 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::stableNorm() const
{
EIGEN_USING_STD_MATH(min);
using std::min;
using std::sqrt;
const Index blockSize = 4096;
RealScalar scale(0);

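What the rescaling above buys numerically, in a quick demonstration (exact digits depend on the platform):

#include <Eigen/Dense>
#include <iostream>
#include <limits>

int main()
{
  const double big = std::numeric_limits<double>::max() / 4;
  Eigen::Vector2d v(big, big);
  std::cout << v.norm()       << "\n";  // squaredNorm overflows: prints inf
  std::cout << v.stableNorm() << "\n";  // blockwise rescaling: ~6.4e307
  std::cout << v.blueNorm()   << "\n";  // Blue's algorithm: same value
  std::cout << v.hypotNorm()  << "\n";  // chained hypot(): same value
}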
View File

@@ -51,7 +51,6 @@ class Stride
};
/** Default constructor, for use when strides are fixed at compile time */
EIGEN_DEVICE_FUNC
Stride()
: m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
{
@@ -59,7 +58,6 @@ class Stride
}
/** Constructor allowing the strides to be specified at runtime */
EIGEN_DEVICE_FUNC
Stride(Index outerStride, Index innerStride)
: m_outer(outerStride), m_inner(innerStride)
{
@@ -67,16 +65,13 @@ class Stride
}
/** Copy constructor */
EIGEN_DEVICE_FUNC
Stride(const Stride& other)
: m_outer(other.outer()), m_inner(other.inner())
{}
/** \returns the outer stride */
EIGEN_DEVICE_FUNC
inline Index outer() const { return m_outer.value(); }
/** \returns the inner stride */
EIGEN_DEVICE_FUNC
inline Index inner() const { return m_inner.value(); }
protected:
@@ -92,8 +87,8 @@ class InnerStride : public Stride<0, Value>
typedef Stride<0, Value> Base;
public:
typedef DenseIndex Index;
EIGEN_DEVICE_FUNC InnerStride() : Base() {}
EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {}
InnerStride() : Base() {}
InnerStride(Index v) : Base(0, v) {}
};
/** \brief Convenience specialization of Stride to specify only an outer stride
@@ -104,8 +99,8 @@ class OuterStride : public Stride<Value, 0>
typedef Stride<Value, 0> Base;
public:
typedef DenseIndex Index;
EIGEN_DEVICE_FUNC OuterStride() : Base() {}
EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {}
OuterStride() : Base() {}
OuterStride(Index v) : Base(v,0) {}
};
} // end namespace Eigen

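Stride and its InnerStride/OuterStride convenience specializations are consumed by Map (and Ref); a small sketch of both:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  float data[] = {1, 2, 3, 4, 5, 6, 7, 8};
  // view every other element of data as a 4-vector: inner stride 2
  Eigen::Map<Eigen::VectorXf, 0, Eigen::InnerStride<2> > v(data, 4);
  // view data as a 2x2 matrix whose columns start 3 floats apart
  Eigen::Map<Eigen::MatrixXf, 0, Eigen::OuterStride<> > m(data, 2, 2,
      Eigen::OuterStride<>(3));
  std::cout << v.transpose() << "\n\n" << m << std::endl;
}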
View File

@@ -33,16 +33,11 @@ template<typename ExpressionType> class SwapWrapper
EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
typedef typename internal::packet_traits<Scalar>::type Packet;
EIGEN_DEVICE_FUNC
inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_expression.rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return m_expression.cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return m_expression.outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return m_expression.innerStride(); }
typedef typename internal::conditional<
@@ -51,37 +46,30 @@ template<typename ExpressionType> class SwapWrapper
const Scalar
>::type ScalarWithConstIfNotLvalue;
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
return m_expression.const_cast_derived().coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
return m_expression.const_cast_derived().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId) const
{
return m_expression.coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index) const
{
return m_expression.coeffRef(index);
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
@@ -93,7 +81,6 @@ template<typename ExpressionType> class SwapWrapper
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
@@ -128,7 +115,6 @@ template<typename ExpressionType> class SwapWrapper
_other.template writePacket<LoadMode>(index, tmp);
}
EIGEN_DEVICE_FUNC
ExpressionType& expression() const { return m_expression; }
protected:

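SwapWrapper is internal plumbing behind DenseBase::swap(); for reference, the user-facing behaviour it implements:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Matrix2i a, b;
  a << 1, 2,
       3, 4;
  b << 5, 6,
       7, 8;
  a.row(0).swap(b.row(0)); // coefficient-wise swap through SwapWrapper
  a.col(0).swap(a.col(1)); // also works between disjoint parts of one matrix
  std::cout << a << "\n\n" << b << std::endl;
}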
View File

@@ -62,21 +62,18 @@ template<typename MatrixType> class Transpose
typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
EIGEN_DEVICE_FUNC
inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
inline Index rows() const { return m_matrix.cols(); }
inline Index cols() const { return m_matrix.rows(); }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename MatrixType::Nested>::type&
nestedExpression() const { return m_matrix; }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
typename internal::remove_all<typename MatrixType::Nested>::type&
nestedExpression() { return m_matrix.const_cast_derived(); }
@@ -109,8 +106,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
typedef typename internal::conditional<
internal::is_lvalue<MatrixType>::value,
@@ -121,39 +118,33 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
inline const Scalar* data() const { return derived().nestedExpression().data(); }
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
}
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
return derived().nestedExpression().const_cast_derived().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
return derived().nestedExpression().coeffRef(colId, rowId);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return derived().nestedExpression().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index rowId, Index colId) const
{
return derived().nestedExpression().coeff(colId, rowId);
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index index) const
{
return derived().nestedExpression().coeff(index);

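Transpose is a view: coeffRef(row,col) above simply forwards to the nested expression with the indices exchanged, so writes go straight through to the underlying matrix.

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Matrix2i m;
  m << 1, 2,
       3, 4;
  m.transpose().coeffRef(0, 1) = 9; // writes m(1,0) through the view
  m.transposeInPlace();             // safe alternative to the aliasing m = m.transpose()
  std::cout << m << std::endl;
}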
View File

@@ -53,8 +53,7 @@ class TranspositionsBase
public:
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndexType;
typedef typename IndicesType::Index Index;
typedef typename IndicesType::Scalar Index;
Derived& derived() { return *static_cast<Derived*>(this); }
const Derived& derived() const { return *static_cast<const Derived*>(this); }
@@ -82,17 +81,17 @@ class TranspositionsBase
inline Index size() const { return indices().size(); }
/** Direct access to the underlying index vector */
inline const StorageIndexType& coeff(Index i) const { return indices().coeff(i); }
inline const Index& coeff(Index i) const { return indices().coeff(i); }
/** Direct access to the underlying index vector */
inline StorageIndexType& coeffRef(Index i) { return indices().coeffRef(i); }
inline Index& coeffRef(Index i) { return indices().coeffRef(i); }
/** Direct access to the underlying index vector */
inline const StorageIndexType& operator()(Index i) const { return indices()(i); }
inline const Index& operator()(Index i) const { return indices()(i); }
/** Direct access to the underlying index vector */
inline StorageIndexType& operator()(Index i) { return indices()(i); }
inline Index& operator()(Index i) { return indices()(i); }
/** Direct access to the underlying index vector */
inline const StorageIndexType& operator[](Index i) const { return indices()(i); }
inline const Index& operator[](Index i) const { return indices()(i); }
/** Direct access to the underlying index vector */
inline StorageIndexType& operator[](Index i) { return indices()(i); }
inline Index& operator[](Index i) { return indices()(i); }
/** const version of indices(). */
const IndicesType& indices() const { return derived().indices(); }
@@ -100,7 +99,7 @@ class TranspositionsBase
IndicesType& indices() { return derived().indices(); }
/** Resizes to given size. */
inline void resize(Index newSize)
inline void resize(int newSize)
{
indices().resize(newSize);
}
@@ -108,7 +107,7 @@ class TranspositionsBase
/** Sets \c *this to represent an identity transformation */
void setIdentity()
{
for(StorageIndexType i = 0; i < indices().size(); ++i)
for(int i = 0; i < indices().size(); ++i)
coeffRef(i) = i;
}
@@ -145,26 +144,23 @@ class TranspositionsBase
};
namespace internal {
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType>
struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType> >
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
{
typedef Matrix<_StorageIndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
typedef typename IndicesType::Index Index;
typedef _StorageIndexType StorageIndexType;
typedef IndexType Index;
typedef Matrix<Index, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
};
}
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType>
class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType> >
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
{
typedef internal::traits<Transpositions> Traits;
public:
typedef TranspositionsBase<Transpositions> Base;
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndexType;
typedef typename IndicesType::Index Index;
typedef typename IndicesType::Scalar Index;
inline Transpositions() {}
@@ -219,32 +215,30 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
namespace internal {
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType, int _PacketAccess>
struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType>,_PacketAccess> >
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,_PacketAccess> >
{
typedef Map<const Matrix<_StorageIndexType,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
typedef typename IndicesType::Index Index;
typedef _StorageIndexType StorageIndexType;
typedef IndexType Index;
typedef Map<const Matrix<Index,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
};
}
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndexType, int PacketAccess>
class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType>,PacketAccess>
: public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexType>,PacketAccess> >
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int PacketAccess>
class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess>
: public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess> >
{
typedef internal::traits<Map> Traits;
public:
typedef TranspositionsBase<Map> Base;
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndexType;
typedef typename IndicesType::Index Index;
typedef typename IndicesType::Scalar Index;
inline Map(const StorageIndexType* indicesPtr)
inline Map(const Index* indicesPtr)
: m_indices(indicesPtr)
{}
inline Map(const StorageIndexType* indicesPtr, Index size)
inline Map(const Index* indicesPtr, Index size)
: m_indices(indicesPtr,size)
{}
@@ -281,8 +275,7 @@ namespace internal {
template<typename _IndicesType>
struct traits<TranspositionsWrapper<_IndicesType> >
{
typedef typename _IndicesType::Scalar StorageIndexType;
typedef typename _IndicesType::Index Index;
typedef typename _IndicesType::Scalar Index;
typedef _IndicesType IndicesType;
};
}
@@ -296,8 +289,7 @@ class TranspositionsWrapper
typedef TranspositionsBase<TranspositionsWrapper> Base;
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndexType;
typedef typename IndicesType::Index Index;
typedef typename IndicesType::Scalar Index;
inline TranspositionsWrapper(IndicesType& a_indices)
: m_indices(a_indices)
@@ -371,25 +363,24 @@ struct transposition_matrix_product_retval
{
typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
typedef typename TranspositionType::Index Index;
typedef typename TranspositionType::StorageIndexType StorageIndexType;
transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
: m_transpositions(tr), m_matrix(matrix)
{}
inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }
inline int rows() const { return m_matrix.rows(); }
inline int cols() const { return m_matrix.cols(); }
template<typename Dest> inline void evalTo(Dest& dst) const
{
const Index size = m_transpositions.size();
StorageIndexType j = 0;
const int size = m_transpositions.size();
Index j = 0;
if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
dst = m_matrix;
for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
if(Index(j=m_transpositions.coeff(k))!=k)
for(int k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
if((j=m_transpositions.coeff(k))!=k)
{
if(Side==OnTheLeft)
dst.row(k).swap(dst.row(j));

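A Transpositions object stores a sequence of elementary row/column swaps (as produced by the pivoting decompositions); applying it is exactly what the evalTo() loop above implements:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Transpositions<3> tr;
  tr.setIdentity();
  tr[1] = 2;                   // step k=1: swap indices 1 and 2
  Eigen::Matrix3d m = Eigen::Matrix3d::Identity();
  Eigen::Matrix3d p = tr * m;  // apply the swaps to the rows
  Eigen::Matrix3d q = m * tr;  // or apply them to the columns
  std::cout << p << "\n\n" << q << std::endl;
}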
View File

@@ -44,39 +44,29 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
typedef typename internal::traits<Derived>::DenseMatrixType DenseMatrixType;
typedef DenseMatrixType DenseType;
EIGEN_DEVICE_FUNC
inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
EIGEN_DEVICE_FUNC
inline Index rows() const { return derived().rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return derived().cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return derived().outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return derived().innerStride(); }
EIGEN_DEVICE_FUNC
inline Scalar coeff(Index row, Index col) const { return derived().coeff(row,col); }
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row,col); }
/** \see MatrixBase::copyCoeff(row,col)
*/
template<typename Other>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other)
{
derived().coeffRef(row, col) = other.coeff(row, col);
}
EIGEN_DEVICE_FUNC
inline Scalar operator()(Index row, Index col) const
{
check_coordinates(row, col);
return coeff(row,col);
}
EIGEN_DEVICE_FUNC
inline Scalar& operator()(Index row, Index col)
{
check_coordinates(row, col);
@@ -84,20 +74,15 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
EIGEN_DEVICE_FUNC
inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
EIGEN_DEVICE_FUNC
inline Derived& derived() { return *static_cast<Derived*>(this); }
#endif // not EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void evalTo(MatrixBase<DenseDerived> &other) const;
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void evalToLazy(MatrixBase<DenseDerived> &other) const;
EIGEN_DEVICE_FUNC
DenseMatrixType toDenseMatrix() const
{
DenseMatrixType res(rows(), cols());
@@ -204,52 +189,36 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
| (Mode & (ZeroDiag))
};
EIGEN_DEVICE_FUNC
inline TriangularView(const MatrixType& matrix) : m_matrix(matrix)
{}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return m_matrix.outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return m_matrix.innerStride(); }
/** \sa MatrixBase::operator+=() */
template<typename Other>
EIGEN_DEVICE_FUNC
TriangularView& operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
/** \sa MatrixBase::operator+=() */
template<typename Other> TriangularView& operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
/** \sa MatrixBase::operator-=() */
template<typename Other>
EIGEN_DEVICE_FUNC
TriangularView& operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
template<typename Other> TriangularView& operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
/** \sa MatrixBase::operator*=() */
EIGEN_DEVICE_FUNC
TriangularView& operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix * other; }
/** \sa MatrixBase::operator/=() */
EIGEN_DEVICE_FUNC
TriangularView& operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix / other; }
/** \sa MatrixBase::fill() */
EIGEN_DEVICE_FUNC
void fill(const Scalar& value) { setConstant(value); }
/** \sa MatrixBase::setConstant() */
EIGEN_DEVICE_FUNC
TriangularView& setConstant(const Scalar& value)
{ return *this = MatrixType::Constant(rows(), cols(), value); }
/** \sa MatrixBase::setZero() */
EIGEN_DEVICE_FUNC
TriangularView& setZero() { return setConstant(Scalar(0)); }
/** \sa MatrixBase::setOnes() */
EIGEN_DEVICE_FUNC
TriangularView& setOnes() { return setConstant(Scalar(1)); }
/** \sa MatrixBase::coeff()
* \warning the coordinates must fit into the referenced triangular part
*/
EIGEN_DEVICE_FUNC
inline Scalar coeff(Index row, Index col) const
{
Base::check_coordinates_internal(row, col);
@@ -259,62 +228,49 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
/** \sa MatrixBase::coeffRef()
* \warning the coordinates must fit into the referenced triangular part
*/
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col)
{
Base::check_coordinates_internal(row, col);
return m_matrix.const_cast_derived().coeffRef(row, col);
}
EIGEN_DEVICE_FUNC
const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
EIGEN_DEVICE_FUNC
MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
/** Assigns a triangular matrix to a triangular part of a dense matrix */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
TriangularView& operator=(const TriangularBase<OtherDerived>& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
TriangularView& operator=(const MatrixBase<OtherDerived>& other);
EIGEN_DEVICE_FUNC
TriangularView& operator=(const TriangularView& other)
{ return *this = other.nestedExpression(); }
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void lazyAssign(const TriangularBase<OtherDerived>& other);
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void lazyAssign(const MatrixBase<OtherDerived>& other);
/** \sa MatrixBase::conjugate() */
EIGEN_DEVICE_FUNC
inline TriangularView<MatrixConjugateReturnType,Mode> conjugate()
{ return m_matrix.conjugate(); }
/** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC
inline const TriangularView<MatrixConjugateReturnType,Mode> conjugate() const
{ return m_matrix.conjugate(); }
/** \sa MatrixBase::adjoint() const */
EIGEN_DEVICE_FUNC
inline const TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> adjoint() const
{ return m_matrix.adjoint(); }
/** \sa MatrixBase::transpose() */
EIGEN_DEVICE_FUNC
inline TriangularView<Transpose<MatrixType>,TransposeMode> transpose()
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
return m_matrix.const_cast_derived().transpose();
}
/** \sa MatrixBase::transpose() const */
EIGEN_DEVICE_FUNC
inline const TriangularView<Transpose<MatrixType>,TransposeMode> transpose() const
{
return m_matrix.transpose();
@@ -322,7 +278,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
/** Efficient triangular matrix times vector/matrix product */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
TriangularProduct<Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
operator*(const MatrixBase<OtherDerived>& rhs) const
{
@@ -333,7 +288,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
/** Efficient vector/matrix times triangular matrix product */
template<typename OtherDerived> friend
EIGEN_DEVICE_FUNC
TriangularProduct<Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
operator*(const MatrixBase<OtherDerived>& lhs, const TriangularView& rhs)
{
@@ -342,33 +296,56 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
(lhs.derived(),rhs.m_matrix);
}
#ifdef EIGEN2_SUPPORT
template<typename OtherDerived>
struct eigen2_product_return_type
{
typedef typename TriangularView<MatrixType,Mode>::DenseMatrixType DenseMatrixType;
typedef typename OtherDerived::PlainObject::DenseType OtherPlainObject;
typedef typename ProductReturnType<DenseMatrixType, OtherPlainObject>::Type ProdRetType;
typedef typename ProdRetType::PlainObject type;
};
template<typename OtherDerived>
const typename eigen2_product_return_type<OtherDerived>::type
operator*(const EigenBase<OtherDerived>& rhs) const
{
typename OtherDerived::PlainObject::DenseType rhsPlainObject;
rhs.evalTo(rhsPlainObject);
return this->toDenseMatrix() * rhsPlainObject;
}
template<typename OtherMatrixType>
bool isApprox(const TriangularView<OtherMatrixType, Mode>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
{
return this->toDenseMatrix().isApprox(other.toDenseMatrix(), precision);
}
template<typename OtherDerived>
bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
{
return this->toDenseMatrix().isApprox(other, precision);
}
#endif // EIGEN2_SUPPORT
template<int Side, typename Other>
EIGEN_DEVICE_FUNC
inline const internal::triangular_solve_retval<Side,TriangularView, Other>
solve(const MatrixBase<Other>& other) const;
template<int Side, typename OtherDerived>
EIGEN_DEVICE_FUNC
void solveInPlace(const MatrixBase<OtherDerived>& other) const;
template<typename Other>
EIGEN_DEVICE_FUNC
inline const internal::triangular_solve_retval<OnTheLeft,TriangularView, Other>
solve(const MatrixBase<Other>& other) const
{ return solve<OnTheLeft>(other); }
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void solveInPlace(const MatrixBase<OtherDerived>& other) const
{ return solveInPlace<OnTheLeft>(other); }
EIGEN_DEVICE_FUNC
const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
{
EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
}
EIGEN_DEVICE_FUNC
SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
{
EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
@@ -376,21 +353,18 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(TriangularBase<OtherDerived> const & other)
{
TriangularView<SwapWrapper<MatrixType>,Mode>(const_cast<MatrixType&>(m_matrix)).lazyAssign(other.derived());
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(MatrixBase<OtherDerived> const & other)
{
SwapWrapper<MatrixType> swaper(const_cast<MatrixType&>(m_matrix));
TriangularView<SwapWrapper<MatrixType>,Mode>(swaper).lazyAssign(other.derived());
}
EIGEN_DEVICE_FUNC
Scalar determinant() const
{
if (Mode & UnitDiag)
@@ -403,55 +377,57 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
// TODO simplify the following:
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularView& operator=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
{
setZero();
return assignProduct(other,1);
return assignProduct(other.derived(),1);
}
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularView& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
{
return assignProduct(other,1);
return assignProduct(other.derived(),1);
}
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularView& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
{
return assignProduct(other,-1);
return assignProduct(other.derived(),-1);
}
template<typename ProductDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularView& operator=(const ScaledProduct<ProductDerived>& other)
{
setZero();
return assignProduct(other,other.alpha());
return assignProduct(other.derived(),other.alpha());
}
template<typename ProductDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularView& operator+=(const ScaledProduct<ProductDerived>& other)
{
return assignProduct(other,other.alpha());
return assignProduct(other.derived(),other.alpha());
}
template<typename ProductDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularView& operator-=(const ScaledProduct<ProductDerived>& other)
{
return assignProduct(other,-other.alpha());
return assignProduct(other.derived(),-other.alpha());
}
protected:
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularView& assignProduct(const ProductBase<ProductDerived, Lhs,Rhs>& prod, const Scalar& alpha);
template<int Mode, bool LhsIsTriangular,
typename Lhs, bool LhsIsVector,
typename Rhs, bool RhsIsVector>
EIGEN_STRONG_INLINE TriangularView& assignProduct(const TriangularProduct<Mode, LhsIsTriangular, Lhs, LhsIsVector, Rhs, RhsIsVector>& prod, const Scalar& alpha)
{
lazyAssign(alpha*prod.eval());
return *this;
}
MatrixTypeNested m_matrix;
};
@@ -472,7 +448,6 @@ struct triangular_assignment_selector
typedef typename Derived1::Scalar Scalar;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
triangular_assignment_selector<Derived1, Derived2, Mode, UnrollCount-1, ClearOpposite>::run(dst, src);
@@ -501,7 +476,6 @@ struct triangular_assignment_selector
template<typename Derived1, typename Derived2, unsigned int Mode, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, Mode, 0, ClearOpposite>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &, const Derived2 &) {}
};
@@ -510,7 +484,6 @@ struct triangular_assignment_selector<Derived1, Derived2, Upper, Dynamic, ClearO
{
typedef typename Derived1::Index Index;
typedef typename Derived1::Scalar Scalar;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
for(Index j = 0; j < dst.cols(); ++j)
@@ -529,7 +502,6 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, Lower, Dynamic, ClearOpposite>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
for(Index j = 0; j < dst.cols(); ++j)
@@ -549,7 +521,6 @@ struct triangular_assignment_selector<Derived1, Derived2, StrictlyUpper, Dynamic
{
typedef typename Derived1::Index Index;
typedef typename Derived1::Scalar Scalar;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
for(Index j = 0; j < dst.cols(); ++j)
@@ -568,7 +539,6 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, StrictlyLower, Dynamic, ClearOpposite>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
for(Index j = 0; j < dst.cols(); ++j)
@@ -587,7 +557,6 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, UnitUpper, Dynamic, ClearOpposite>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
for(Index j = 0; j < dst.cols(); ++j)
@@ -608,7 +577,6 @@ template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, UnitLower, Dynamic, ClearOpposite>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
for(Index j = 0; j < dst.cols(); ++j)
@@ -751,6 +719,41 @@ void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
* Implementation of MatrixBase methods
***************************************************************************/
#ifdef EIGEN2_SUPPORT
// implementation of part<>(), including the SelfAdjoint case.
namespace internal {
template<typename MatrixType, unsigned int Mode>
struct eigen2_part_return_type
{
typedef TriangularView<MatrixType, Mode> type;
};
template<typename MatrixType>
struct eigen2_part_return_type<MatrixType, SelfAdjoint>
{
typedef SelfAdjointView<MatrixType, Upper> type;
};
}
/** \deprecated use MatrixBase::triangularView() */
template<typename Derived>
template<unsigned int Mode>
const typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part() const
{
return derived();
}
/** \deprecated use MatrixBase::triangularView() */
template<typename Derived>
template<unsigned int Mode>
typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part()
{
return derived();
}
#endif
/**
* \returns an expression of a triangular view extracted from the current matrix
*

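The members above in action: assignment through a triangular view, and the solve()/solveInPlace() pair (random data, purely illustrative):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Matrix3d A = Eigen::Matrix3d::Random();
  A.diagonal().array() += 4.0;  // keep the triangular factor well conditioned
  Eigen::Vector3d b = Eigen::Vector3d::Ones();
  // solve L x = b where L is the lower triangle of A
  Eigen::Vector3d x = A.triangularView<Eigen::Lower>().solve(b);
  A.triangularView<Eigen::Lower>().solveInPlace(b); // same, overwriting b
  std::cout << (x - b).norm() << std::endl;         // ~0
}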
View File

@@ -72,7 +72,6 @@ template<typename VectorType, int Size> class VectorBlock
/** Dynamic-size constructor
*/
EIGEN_DEVICE_FUNC
inline VectorBlock(VectorType& vector, Index start, Index size)
: Base(vector,
IsColVector ? start : 0, IsColVector ? 0 : start,
@@ -83,7 +82,6 @@ template<typename VectorType, int Size> class VectorBlock
/** Fixed-size constructor
*/
EIGEN_DEVICE_FUNC
inline VectorBlock(VectorType& vector, Index start)
: Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
{

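VectorBlock is the type returned by segment()/head()/tail(); the two constructors above correspond to its dynamic- and fixed-size flavours:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::VectorXi v(6);
  v << 0, 1, 2, 3, 4, 5;
  v.segment(1, 3).setZero();   // dynamic-size ctor: start=1, size=3
  v.head<2>().setConstant(7);  // fixed-size ctor: start=0, Size=2
  std::cout << v.transpose() << std::endl;  // 7 7 0 0 4 5
}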
View File

@@ -302,7 +302,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression of the squared norm
* of each column (or row) of the referenced expression.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* Example: \include PartialRedux_squaredNorm.cpp
* Output: \verbinclude PartialRedux_squaredNorm.out
@@ -313,7 +312,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* Example: \include PartialRedux_norm.cpp
* Output: \verbinclude PartialRedux_norm.out
@@ -325,8 +323,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression, using
* Blue's algorithm.
* This is a vector with real entries, even if the original matrix has complex entries.
* blue's algorithm.
*
* \sa DenseBase::blueNorm() */
const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
@@ -336,7 +333,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression, avoiding
* underflow and overflow.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa DenseBase::stableNorm() */
const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
@@ -346,7 +342,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression, avoiding
* underflow and overflow using a concatenation of hypot() calls.
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa DenseBase::hypotNorm() */
const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
@@ -371,7 +366,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression representing
* whether \b all coefficients of each respective column (or row) are \c true.
* This expression can be assigned to a vector with entries of type \c bool.
*
* \sa DenseBase::all() */
const typename ReturnType<internal::member_all>::Type all() const
@@ -379,7 +373,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression representing
* whether \b at \b least one coefficient of each respective column (or row) is \c true.
* This expression can be assigned to a vector with entries of type \c bool.
*
* \sa DenseBase::any() */
const typename ReturnType<internal::member_any>::Type any() const
@@ -387,8 +380,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression representing
* the number of \c true coefficients of each respective column (or row).
* This expression can be assigned to a vector whose entries have the same type as is used to
* index entries of the original matrix; for dense matrices, this is \c std::ptrdiff_t .
*
* Example: \include PartialRedux_count.cpp
* Output: \verbinclude PartialRedux_count.out
@@ -560,7 +551,9 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/////////// Geometry module ///////////
#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
Homogeneous<ExpressionType,Direction> homogeneous() const;
#endif
typedef typename ExpressionType::PlainObject CrossReturnType;
template<typename OtherDerived>


@@ -194,7 +194,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::min_coeff_visitor<Derived> minVisitor;
this->visit(minVisitor);
*index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
*index = (RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row;
return minVisitor.res;
}


@@ -1,6 +0,0 @@
FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
INSTALL(FILES
${Eigen_Core_arch_AVX_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
)


@@ -1,463 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_AVX_H
#define EIGEN_COMPLEX_AVX_H
namespace Eigen {
namespace internal {
//---------- float ----------
struct Packet4cf
{
EIGEN_STRONG_INLINE Packet4cf() {}
EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
__m256 v;
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet4cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
HasHalfPacket = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
{
return Packet4cf(pnegate(a.v));
}
template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
{
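// Conjugation flips the sign of the imaginary parts: xor the sign bit
// (0x80000000) of the odd (imaginary) lanes, leaving the reals untouched.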
const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
return Packet4cf(_mm256_xor_ps(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
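// Complex product (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br):
// moveldup duplicates a's real parts so tmp1 = (ar*br, ar*bi), movehdup
// duplicates a's imaginary parts while the permute swaps b's components so
// tmp2 = (ai*bi, ai*br); addsub subtracts in the real lanes and adds in the
// imaginary lanes, yielding the result above.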
__m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
__m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
__m256 result = _mm256_addsub_ps(tmp1, tmp2);
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
{
return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
}
template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
{
// FIXME The following might be optimized using _mm256_movedup_pd
Packet2cf a = ploaddup<Packet2cf>(from);
Packet2cf b = ploaddup<Packet2cf>(from+1);
return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
}
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, DenseIndex stride)
{
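// _mm256_set_ps lists lanes from most significant down to least, hence the
// (imag, real) pairs appear in reverse order, from stride 3 down to stride 0.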
return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
std::imag(from[2*stride]), std::real(from[2*stride]),
std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride])));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, DenseIndex stride)
{
__m128 low = _mm256_extractf128_ps(from.v, 0);
to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
__m128 high = _mm256_extractf128_ps(from.v, 1);
to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
}
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
{
return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
}
template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
__m128 low = _mm256_extractf128_ps(a.v, 0);
__m128 high = _mm256_extractf128_ps(a.v, 1);
__m128d lowd = _mm_castps_pd(low);
__m128d highd = _mm_castps_pd(high);
low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
__m256 result = _mm256_setzero_ps();
result = _mm256_insertf128_ps(result, low, 1);
result = _mm256_insertf128_ps(result, high, 0);
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
{
return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
Packet2cf(_mm256_extractf128_ps(a.v,1))));
}
template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
{
Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
t0 = _mm256_hadd_ps(t0,t1);
Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
t2 = _mm256_hadd_ps(t2,t3);
t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
return Packet4cf(_mm256_add_ps(t1,t3));
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
{
return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
Packet2cf(_mm256_extractf128_ps(a.v, 1))));
}
template<int Offset>
struct palign_impl<Offset,Packet4cf>
{
static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
{
if (Offset==0) return;
palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet8f, Packet4cf, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
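// a/b = a*conj(b) / |b|^2: num holds a*conj(b); tmp = (br^2, bi^2) and the
// shuffle swaps each pair, so denom carries |b|^2 in both lanes of each
// complex before the final element-wise division.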
Packet4cf num = pmul(a, pconj(b));
__m256 tmp = _mm256_mul_ps(b.v, b.v);
__m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1);
__m256 denom = _mm256_add_ps(tmp, tmp2);
return Packet4cf(_mm256_div_ps(num.v, denom));
}
template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
{
return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
}
//---------- double ----------
struct Packet2cd
{
EIGEN_STRONG_INLINE Packet2cd() {}
EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
__m256d v;
};
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet2cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 2,
HasHalfPacket = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
{
const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
return Packet2cd(_mm256_xor_pd(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
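// Same scheme as the float path: shuffle mask 0x0 duplicates a's real parts,
// 0xF duplicates its imaginary parts, 0x5 swaps b's components, and addsub
// combines the partial products into real and imaginary lanes.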
__m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
__m256d even = _mm256_mul_pd(tmp1, b.v);
__m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
__m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
__m256d odd = _mm256_mul_pd(tmp2, tmp3);
return Packet2cd(_mm256_addsub_pd(even, odd));
}
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
{
// If casting to a __m128d* is really not safe, we can still fall back to this (much slower) version:
// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
}
template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, DenseIndex stride)
{
return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride])));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, DenseIndex stride)
{
__m128d low = _mm256_extractf128_pd(from.v, 0);
to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
__m128d high = _mm256_extractf128_pd(from.v, 1);
to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
}
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
{
__m128d low = _mm256_extractf128_pd(a.v, 0);
EIGEN_ALIGN16 double res[2];
_mm_store_pd(res, low);
return std::complex<double>(res[0],res[1]);
}
template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
__m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
return Packet2cd(result);
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
{
return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
Packet1cd(_mm256_extractf128_pd(a.v,1))));
}
template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
{
Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
return Packet2cd(_mm256_add_pd(t0,t1));
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
{
return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
Packet1cd(_mm256_extractf128_pd(a.v,1))));
}
template<int Offset>
struct palign_impl<Offset,Packet2cd>
{
static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
{
if (Offset==0) return;
palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet4d, Packet2cd, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
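// a/b = a*conj(b) / |b|^2: hadd_pd sums (br^2, bi^2) within each 128-bit
// half, placing |b|^2 in both lanes of the corresponding complex.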
Packet2cd num = pmul(a, pconj(b));
__m256d tmp = _mm256_mul_pd(b.v, b.v);
__m256d denom = _mm256_hadd_pd(tmp, tmp);
return Packet2cd(_mm256_div_pd(num.v, denom));
}
template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
{
return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4cf,4>& kernel) {
__m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
__m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
__m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
__m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
__m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
__m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
__m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
__m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cd,2>& kernel) {
__m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
kernel.packet[0].v = tmp;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_AVX_H


@@ -1,564 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H
namespace Eigen {
namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif
#endif
typedef __m256 Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;
template<> struct is_arithmetic<__m256> { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };
#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
const Packet8f p8f_##NAME = pset1<Packet8f>(X)
#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
const Packet4d p4d_##NAME = pset1<Packet4d>(X)
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet8f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8,
HasHalfPacket = 1,
HasDiv = 1,
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
};
};
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet4d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket = 1,
HasDiv = 1,
HasExp = 0
};
};
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet8i type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8
};
};
*/
template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8}; };
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by AVX");
return pset1<Packet8i>(0);
}
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if defined(__clang__) || defined(__GNUC__)
// clang generates a vfmadd213ps instruction plus some vmovaps on registers,
// and gcc generates a vfmadd132ps instruction, so force the compiler to emit
// a vfmadd231ps instruction instead, since the most common use case is to
// accumulate the result of the product.
Packet8f res = c;
__asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
#else
return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if defined(__clang__) || defined(__GNUC__)
// see above
Packet4d res = c;
__asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
#else
return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
// TODO: try to find a way to avoid the need for a temporary register
// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
// return _mm256_unpacklo_ps(tmp,tmp);
// _mm256_insertf128_ps is very slow on Haswell, thus:
Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
// mimic an "in-place" permutation of the lower 128 bits using a blend
tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
// then we can perform a consistent permutation on the global register to get everything in shape:
return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
return _mm256_permute_pd(tmp, 3<<2);
}
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, DenseIndex stride)
{
return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, DenseIndex stride)
{
return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, DenseIndex stride)
{
__m128 low = _mm256_extractf128_ps(from, 0);
to[stride*0] = _mm_cvtss_f32(low);
to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
__m128 high = _mm256_extractf128_ps(from, 1);
to[stride*4] = _mm_cvtss_f32(high);
to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, DenseIndex stride)
{
__m128d low = _mm256_extractf128_pd(from, 0);
to[stride*0] = _mm_cvtsd_f64(low);
to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
__m128d high = _mm256_extractf128_pd(from, 1);
to[stride*2] = _mm_cvtsd_f64(high);
to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
Packet8f pa = pset1<Packet8f>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
Packet4d pa = pset1<Packet4d>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
Packet8i pa = pset1<Packet8i>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
__m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
__m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
// Equivalent alternative, unreachable above and kept only for reference:
// __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
// return _mm256_permute_pd(swap_halves,5);
}
// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
return _mm256_and_pd(a,mask);
}
// preduxp should be ok
// FIXME: why is this ok? why isn't the simple implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
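// Returns a packet whose i-th lane holds the horizontal sum of vecs[i]:
// hadd pairs the inputs and a second hadd finishes the in-lane sums, the
// permute2f128/add steps fold the two 128-bit halves together, and the
// final blends gather the eight per-vector sums into one register.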
__m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
__m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
__m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
__m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
__m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
__m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
__m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
__m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
__m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
__m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
__m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
__m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
__m256 sum1 = _mm256_add_ps(perm1, hsum5);
__m256 sum2 = _mm256_add_ps(perm2, hsum6);
__m256 sum3 = _mm256_add_ps(perm3, hsum7);
__m256 sum4 = _mm256_add_ps(perm4, hsum8);
__m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
__m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
__m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
return final;
}
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
Packet4d tmp0, tmp1;
tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
return _mm256_blend_pd(tmp0, tmp1, 0xC);
}
template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
tmp0 = _mm256_hadd_ps(tmp0,tmp0);
return pfirst(_mm256_hadd_ps(tmp0, tmp0));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_hadd_pd(tmp0,tmp0));
}
template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
Packet8f tmp;
tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
Packet4d tmp;
tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template<int Offset>
struct palign_impl<Offset,Packet8f>
{
static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
{
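// Shifts the 16-element concatenation [first, second] left by Offset floats,
// writing the result into 'first'. AVX1 has no cross-lane element shift, so
// each case blends in Offset elements from 'second', permutes within the
// 128-bit lanes, and uses permute2f128 to move data across the lane boundary.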
if (Offset==1)
{
first = _mm256_blend_ps(first, second, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
}
else if (Offset==2)
{
first = _mm256_blend_ps(first, second, 3);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
}
else if (Offset==3)
{
first = _mm256_blend_ps(first, second, 7);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
}
else if (Offset==4)
{
first = _mm256_blend_ps(first, second, 15);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
}
else if (Offset==5)
{
first = _mm256_blend_ps(first, second, 31);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0x88);
}
else if (Offset==6)
{
first = _mm256_blend_ps(first, second, 63);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0xcc);
}
else if (Offset==7)
{
first = _mm256_blend_ps(first, second, 127);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0xee);
}
}
};
template<int Offset>
struct palign_impl<Offset,Packet4d>
{
static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
{
if (Offset==1)
{
first = _mm256_blend_pd(first, second, 1);
__m256d tmp = _mm256_permute_pd(first, 5);
first = _mm256_permute2f128_pd(tmp, tmp, 1);
first = _mm256_blend_pd(tmp, first, 0xA);
}
else if (Offset==2)
{
first = _mm256_blend_pd(first, second, 3);
first = _mm256_permute2f128_pd(first, first, 1);
}
else if (Offset==3)
{
first = _mm256_blend_pd(first, second, 7);
__m256d tmp = _mm256_permute_pd(first, 5);
first = _mm256_permute2f128_pd(tmp, tmp, 1);
first = _mm256_blend_pd(tmp, first, 5);
}
}
};
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
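// 8x8 float transpose in three stages: unpacklo/unpackhi interleave pairs of
// rows, shuffle_ps gathers the 4-wide column fragments within each 128-bit
// half, and permute2f128 finally exchanges the halves across registers.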
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
__m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
__m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
__m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
__m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
__m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
__m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
__m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
__m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
__m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
__m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
__m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
__m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
__m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
__m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
__m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
__m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
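// 4x4 double transpose: shuffle_pd interleaves each row pair within the
// 128-bit halves, then permute2f128 (0x20/0x31) recombines the lower and
// upper halves across registers.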
__m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
__m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
__m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
__m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_AVX_H


@@ -16,14 +16,11 @@ namespace internal {
static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
static Packet16uc p16uc_COMPLEX_RE = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_COMPLEX_IM = vec_sld(p16uc_DUPLICATE, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_COMPLEX_IM = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_COMPLEX_RE, (Packet4ui)p16uc_COMPLEX_IM);//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_COMPLEX_RE, (Packet4ui)p16uc_COMPLEX_IM);//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_COMPLEX_MASK16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);//{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = vec_add(p16uc_PSET_HI, p16uc_COMPLEX_MASK16);//{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = vec_add(p16uc_PSET_LO, p16uc_COMPLEX_MASK16);//{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
//---------- float ----------
struct Packet2cf
@@ -36,7 +33,6 @@ struct Packet2cf
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@@ -55,7 +51,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@@ -69,22 +65,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, DenseIndex stride)
{
std::complex<float> EIGEN_ALIGN16 af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return Packet2cf(vec_ld(0, (const float*)af));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, DenseIndex stride)
{
std::complex<float> EIGEN_ALIGN16 af[2];
vec_st(from.v, 0, (float*)af);
to[0*stride] = af[0];
to[1*stride] = af[1];
}
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
@@ -230,13 +210,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x
return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1);
kernel.packet[0].v = tmp;
}
} // end namespace internal
} // end namespace Eigen

Eigen/src/Core/arch/AltiVec/PacketMath.h Executable file → Normal file

@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2014 Konstantinos Margaritis <markos@freevec.org>
// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,10 +18,6 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif
#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#endif
@@ -60,32 +56,29 @@ typedef __vector unsigned char Packet16uc;
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
// Define global static constants:
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet16uc p16uc_REVERSE = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); //{ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}
static Packet16uc p16uc_DUPLICATE = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket=0,
// FIXME check the Has*
HasDiv = 1,
HasSin = 0,
HasCos = 0,
HasLog = 0,
@@ -96,7 +89,6 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half;
enum {
// FIXME check the Has*
Vectorizable = 1,
@@ -105,8 +97,8 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
/*
inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
@@ -152,7 +144,6 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
return s;
}
*/
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
float EIGEN_ALIGN16 af[4];
@@ -170,65 +161,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
return vc;
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
a3 = vec_ld(0,a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
a2 = vec_splat(a3, 2);
a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
a3 = vec_ld(0,a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
a2 = vec_splat(a3, 2);
a3 = vec_splat(a3, 3);
}
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, DenseIndex stride)
{
float EIGEN_ALIGN16 af[4];
af[0] = from[0*stride];
af[1] = from[1*stride];
af[2] = from[2*stride];
af[3] = from[3*stride];
return vec_ld(0, af);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, DenseIndex stride)
{
int EIGEN_ALIGN16 ai[4];
ai[0] = from[0*stride];
ai[1] = from[1*stride];
ai[2] = from[2*stride];
ai[3] = from[3*stride];
return vec_ld(0, ai);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, DenseIndex stride)
{
float EIGEN_ALIGN16 af[4];
vec_st(from, 0, af);
to[0*stride] = af[0];
to[1*stride] = af[1];
to[2*stride] = af[2];
to[3*stride] = af[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, DenseIndex stride)
{
int EIGEN_ALIGN16 ai[4];
vec_st(from, 0, ai);
to[0*stride] = ai[0];
to[1*stride] = ai[1];
to[2*stride] = ai[2];
to[3*stride] = ai[3];
}
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
@@ -354,15 +286,15 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
Packet4f p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
else p = ploadu<Packet4f>(from);
if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4f>(from);
else p = ploadu<Packet4f>(from);
return vec_perm(p, p, p16uc_DUPLICATE);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
Packet4i p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
else p = ploadu<Packet4i>(from);
if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4i>(from);
else p = ploadu<Packet4i>(from);
return vec_perm(p, p, p16uc_DUPLICATE);
}
@@ -562,32 +494,6 @@ struct palign_impl<Offset,Packet4i>
}
};
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
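// Classic 4x4 transpose: the first vec_mergeh/vec_mergel round interleaves
// rows 0/2 and 1/3, the second round recombines the halves into columns.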
Packet4f t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
kernel.packet[0] = vec_mergeh(t0, t2);
kernel.packet[1] = vec_mergel(t0, t2);
kernel.packet[2] = vec_mergeh(t1, t3);
kernel.packet[3] = vec_mergel(t1, t3);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
Packet4i t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
kernel.packet[0] = vec_mergeh(t0, t2);
kernel.packet[1] = vec_mergel(t0, t2);
kernel.packet[2] = vec_mergeh(t1, t3);
kernel.packet[3] = vec_mergel(t1, t3);
}
} // end namespace internal
} // end namespace Eigen


@@ -1,5 +1,4 @@
ADD_SUBDIRECTORY(SSE)
ADD_SUBDIRECTORY(AltiVec)
ADD_SUBDIRECTORY(NEON)
ADD_SUBDIRECTORY(AVX)
ADD_SUBDIRECTORY(Default)


@@ -28,7 +28,6 @@ struct Packet2cf
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@@ -47,7 +46,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@@ -111,22 +110,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, DenseIndex stride)
{
Packet4f res;
res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
return Packet2cf(res);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, DenseIndex stride)
{
to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
@@ -263,14 +246,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cf,2>& kernel) {
float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
kernel.packet[1].v = tmp;
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -49,7 +49,6 @@ typedef uint32x4_t Packet4ui;
#define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
#endif
// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
// which is available on LLVM and GCC (at least)
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
@@ -66,7 +65,6 @@ typedef uint32x4_t Packet4ui;
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@@ -84,7 +82,6 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@@ -97,13 +94,12 @@ template<> struct packet_traits<int> : default_packet_traits
// workaround for gcc 4.2, 4.3 and 4.4 compilation issue
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
@@ -222,40 +218,6 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, DenseIndex stride)
{
Packet4f res;
res = vsetq_lane_f32(from[0*stride], res, 0);
res = vsetq_lane_f32(from[1*stride], res, 1);
res = vsetq_lane_f32(from[2*stride], res, 2);
res = vsetq_lane_f32(from[3*stride], res, 3);
return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, DenseIndex stride)
{
Packet4i res;
res = vsetq_lane_s32(from[0*stride], res, 0);
res = vsetq_lane_s32(from[1*stride], res, 1);
res = vsetq_lane_s32(from[2*stride], res, 2);
res = vsetq_lane_s32(from[3*stride], res, 3);
return res;
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, DenseIndex stride)
{
to[stride*0] = vgetq_lane_f32(from, 0);
to[stride*1] = vgetq_lane_f32(from, 1);
to[stride*2] = vgetq_lane_f32(from, 2);
to[stride*3] = vgetq_lane_f32(from, 3);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, DenseIndex stride)
{
to[stride*0] = vgetq_lane_s32(from, 0);
to[stride*1] = vgetq_lane_s32(from, 1);
to[stride*2] = vgetq_lane_s32(from, 2);
to[stride*3] = vgetq_lane_s32(from, 3);
}
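pgather and pscatter give packets strided-access semantics: a gather fills lane i from from[i*stride], a scatter writes lane i to to[i*stride]. A scalar reference of the contract these NEON overloads implement (illustrative helpers, not Eigen API):
// Reference semantics of pgather/pscatter for a 4-lane integer packet.
static inline void pgather_ref(const int* from, long stride, int lanes[4])
{
  for (int i = 0; i < 4; ++i)
    lanes[i] = from[i * stride];
}
static inline void pscatter_ref(int* to, const int lanes[4], long stride)
{
  for (int i = 0; i < 4; ++i)
    to[i * stride] = lanes[i];
}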
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); }
@@ -448,30 +410,9 @@ PALIGN_NEON(0,Packet4i,vextq_s32)
PALIGN_NEON(1,Packet4i,vextq_s32)
PALIGN_NEON(2,Packet4i,vextq_s32)
PALIGN_NEON(3,Packet4i,vextq_s32)
#undef PALIGN_NEON
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -22,18 +22,13 @@ struct Packet2cf
__m128 v;
};
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
@@ -47,9 +42,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasSetLinear = 0
};
};
#endif
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -110,23 +104,8 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, DenseIndex stride)
{
return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride])));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, DenseIndex stride)
{
to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
}
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
@@ -145,7 +124,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
#endif
}
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
@@ -235,7 +214,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
{ return Packet2cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
@@ -244,7 +223,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
{ return Packet2cf(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -269,18 +248,13 @@ struct Packet1cd
__m128d v;
};
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet1cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 1,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
@@ -294,13 +268,12 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
HasSetLinear = 0
};
};
#endif
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
{
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
@@ -338,8 +311,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
// FIXME force unaligned store, this is a temporary fix
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
@@ -437,7 +410,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
{ return Packet1cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
@@ -446,7 +419,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
{ return Packet1cd(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
@@ -459,17 +432,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
return Packet1cd(preverse(Packet2d(x.v)));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cf,2>& kernel) {
__m128d w1 = _mm_castps_pd(kernel.packet[0].v);
__m128d w2 = _mm_castps_pd(kernel.packet[1].v);
__m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
kernel.packet[1].v = tmp;
return Packet1cd(preverse(x.v));
}
} // end namespace internal

View File

@@ -52,7 +52,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
Packet4i emm0;
Packet4f invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
@@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
x = _mm_or_ps(x, p4f_half);
emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
/* part2:
if( x < SQRTHF ) {
@@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
} else { x = x - 1.0; }
*/
Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
Packet4f tmp = pand(x, mask);
Packet4f tmp = _mm_and_ps(x, mask);
x = psub(x, p4f_1);
e = psub(e, pand(p4f_1, mask));
e = psub(e, _mm_and_ps(p4f_1, mask));
x = padd(x, tmp);
Packet4f x2 = pmul(x,x);
@@ -138,7 +138,6 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
#ifdef EIGEN_VECTORIZE_SSE4_1
fx = _mm_floor_ps(fx);
#else
tmp = _mm_setzero_ps();
emm0 = _mm_cvttps_epi32(fx);
tmp = _mm_cvtepi32_ps(emm0);
/* if greater, subtract 1 */
@@ -167,7 +166,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
return pmul(y, Packet4f(_mm_castsi128_ps(emm0)));
return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
@@ -207,7 +206,6 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
#ifdef EIGEN_VECTORIZE_SSE4_1
fx = _mm_floor_pd(fx);
#else
tmp = _mm_setzero_pd();
emm0 = _mm_cvttpd_epi32(fx);
tmp = _mm_cvtepi32_pd(emm0);
/* if greater, subtract 1 */
@@ -241,7 +239,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
emm0 = _mm_add_epi32(emm0, p4i_1023_0);
emm0 = _mm_slli_epi32(emm0, 20);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
return pmul(x, Packet2d(_mm_castsi128_pd(emm0)));
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
}
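The new pmax(..., _x) wrapping in both pexp hunks is a cheap safety net: since exp(x) > x for every finite real x, taking the maximum never changes a correctly computed result, while the SSE max semantics (the second operand is returned on an unordered comparison) make a NaN input propagate to the output. A scalar sketch of the same idea (pexp_ref is illustrative, not Eigen API):
#include <cmath>
// std::exp stands in for the vectorized polynomial evaluation above.
inline double pexp_ref(double x)
{
  const double y = std::exp(x);
  // Mirrors _mm_max_pd(y, x): if x is NaN the comparison is false
  // and x (NaN) is returned; otherwise y > x always holds.
  return (y > x) ? y : x;
}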
/* evaluation of 4 sines at onces, using SSE2 intrinsics.

Eigen/src/Core/arch/SSE/PacketMath.h Executable file → Normal file
View File

@@ -22,41 +22,9 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif
#endif
#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER)
// With GCC's default ABI version, a __m128 and a __m256 are the same type, and therefore we cannot
// have overloads for both types without a linking error.
// One solution is to increase the ABI version using -fabi-version=4 (or greater).
// To work around this inconvenience, we instead wrap the 128-bit types in the following helper
// structure:
// TODO disable this wrapper when abi-version>=4, but how to detect that without asking the user to define a macro?
template<typename T>
struct eigen_packet_wrapper
{
EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
m_val = v;
return *this;
}
T m_val;
};
typedef eigen_packet_wrapper<__m128> Packet4f;
typedef eigen_packet_wrapper<__m128i> Packet4i;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
typedef __m128 Packet4f;
typedef __m128i Packet4i;
typedef __m128d Packet2d;
#endif
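A minimal sketch of the overload collision the wrapper avoids (f is a hypothetical function, not Eigen API; assumes AVX is enabled): with the old default ABI, f(__m128) and f(__m256) can mangle to the same symbol and fail to link, whereas a distinct struct type always resolves cleanly.
#include <immintrin.h>
struct wrapped_m128 { __m128 v; };         // same layout, distinct C++ type
inline int f(wrapped_m128) { return 128; } // never clashes with the next one
inline int f(__m256)       { return 256; } // f(__m128) could clash here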
template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
@@ -90,18 +58,13 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket = 0,
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
@@ -114,23 +77,19 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet2d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=2,
HasHalfPacket = 0,
HasDiv = 1,
HasExp = 1,
HasSqrt = 1
};
};
#endif
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half;
enum {
// FIXME check the Has*
Vectorizable = 1,
@@ -139,9 +98,9 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
#if defined(_MSC_VER) && (_MSC_VER==1500)
// Workaround MSVC 9 internal compiler error.
@@ -151,26 +110,13 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203).
// Using inline assembly is not an option either, because then gcc fails to reorder the instructions properly.
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
}
#endif
#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
#endif
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
@@ -193,7 +139,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
return psub(_mm_setr_epi32(0,0,0,0), a);
}
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
@@ -227,10 +173,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
@@ -276,7 +218,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
#if defined(_MSC_VER)
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
@@ -294,7 +236,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
#else
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require pointer casting to incompatible pointer types and lead to invalid code
@@ -303,17 +245,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E
// TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes!
#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8)))
#if defined(__GNUC__) && defined(__i386__)
// bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
#elif defined(__clang__)
// bug 201: Segfaults in __mm_loadh_pd with clang 2.8
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
#else
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
#define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
#endif
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
@@ -344,7 +283,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
#else
__m128d res;
res = _mm_load_sd((const double*)(from)) ;
@@ -363,77 +302,38 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
Packet4i tmp;
tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
EIGEN_DEBUG_UNALIGNED_STORE
#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES
_mm_storeu_pd(to, from);
#else
_mm_storel_pd((to), from);
_mm_storeh_pd((to+1), from);
#endif
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, DenseIndex stride)
{
return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride)
{
return _mm_set_pd(from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, DenseIndex stride)
{
return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, DenseIndex stride)
{
to[stride*0] = _mm_cvtss_f32(from);
to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride)
{
to[stride*0] = _mm_cvtsd_f64(from);
to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, DenseIndex stride)
{
to[stride*0] = _mm_cvtsi128_si32(from);
to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
Packet4f pa = _mm_set_ss(a);
pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
pstore(to, vec4f_swizzle1(pa,0,0,0,0));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
Packet2d pa = _mm_set_sd(a);
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
pstore(to, vec2d_swizzle1(pa,0,0));
}
#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#endif
#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
@@ -480,38 +380,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
#endif
}
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
a3 = pload<Packet4f>(a);
a0 = vec4f_swizzle1(a3, 0,0,0,0);
a1 = vec4f_swizzle1(a3, 1,1,1,1);
a2 = vec4f_swizzle1(a3, 2,2,2,2);
a3 = vec4f_swizzle1(a3, 3,3,3,3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
#ifdef EIGEN_VECTORIZE_SSE3
a0 = _mm_loaddup_pd(a+0);
a1 = _mm_loaddup_pd(a+1);
a2 = _mm_loaddup_pd(a+2);
a3 = _mm_loaddup_pd(a+3);
#else
a1 = pload<Packet2d>(a);
a0 = vec2d_swizzle1(a1, 0,0);
a1 = vec2d_swizzle1(a1, 1,1);
a3 = pload<Packet2d>(a+2);
a2 = vec2d_swizzle1(a3, 0,0);
a3 = vec2d_swizzle1(a3, 1,1);
#endif
}
#endif
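pbroadcast4 reads four consecutive scalars and splats each one across a whole packet, which is the access pattern of the product kernels. A scalar reference of the contract for the 2-lane double case (illustrative helper, not Eigen API):
// Each output packet is one input scalar replicated across all lanes.
static inline void pbroadcast4_ref(const double* a, double a0[2], double a1[2],
                                   double a2[2], double a3[2])
{
  for (int i = 0; i < 2; ++i) {
    a0[i] = a[0];
    a1[i] = a[1];
    a2[i] = a[2];
    a3[i] = a[3];
  }
}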
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
{
vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
@@ -539,10 +407,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet4f tmp0 = _mm_hadd_ps(a,a);
return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
return pfirst(_mm_hadd_ps(tmp0, tmp0));
}
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
// SSSE3 version:
// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
@@ -585,7 +453,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
}
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -608,11 +476,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
@@ -628,18 +496,14 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore!)
EIGEN_ALIGN16 int aux[4];
@@ -647,25 +511,20 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
return aux0<aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore!)
EIGEN_ALIGN16 int aux[4];
@@ -673,7 +532,6 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
return aux0>aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
#if (defined __GNUC__)
@@ -784,31 +642,6 @@ struct palign_impl<Offset,Packet2d>
};
#endif
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
__m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
kernel.packet[1] = tmp;
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
__m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
__m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
__m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
__m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -1,167 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ASSIGNMENT_FUNCTORS_H
#define EIGEN_ASSIGNMENT_FUNCTORS_H
namespace Eigen {
namespace internal {
/** \internal
* \brief Template functor for scalar/packet assignment
*
*/
template<typename Scalar> struct assign_op {
EIGEN_EMPTY_STRUCT_CTOR(assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a = b; }
template<int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
{ internal::pstoret<Scalar,Packet,Alignment>(a,b); }
};
template<typename Scalar>
struct functor_traits<assign_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost,
PacketAccess = packet_traits<Scalar>::IsVectorized
};
};
/** \internal
* \brief Template functor for scalar/packet assignment with addition
*
*/
template<typename Scalar> struct add_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a += b; }
template<int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
{ internal::pstoret<Scalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
};
template<typename Scalar>
struct functor_traits<add_assign_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasAdd
};
};
/** \internal
* \brief Template functor for scalar/packet assignment with subtraction
*
*/
template<typename Scalar> struct sub_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a -= b; }
template<int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
{ internal::pstoret<Scalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
};
template<typename Scalar>
struct functor_traits<sub_assign_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasAdd
};
};
/** \internal
* \brief Template functor for scalar/packet assignment with multiplication
*
*/
template<typename Scalar> struct mul_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a *= b; }
template<int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
{ internal::pstoret<Scalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
};
template<typename Scalar>
struct functor_traits<mul_assign_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasMul
};
};
/** \internal
* \brief Template functor for scalar/packet assignment with division
*
*/
template<typename Scalar> struct div_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a /= b; }
template<int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
{ internal::pstoret<Scalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
};
template<typename Scalar>
struct functor_traits<div_assign_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasMul
};
};
/** \internal
* \brief Template functor for scalar/packet assignment with swapping
*
* It works as follows. For a non-vectorized evaluation loop, we have:
* for(i) func(A.coeffRef(i), B.coeff(i));
* where B is a SwapWrapper expression. The trick is to make SwapWrapper::coeff behave like a non-const coeffRef.
* Actually, SwapWrapper might not even be needed: since B has to be writable even when it is a plain expression,
* B.coeff already returns a const reference to the underlying scalar value.
*
* The case of a vectorized loop is more tricky:
* for(i,j) func.assignPacket<A_Align>(&A.coeffRef(i,j), B.packet<B_Align>(i,j));
* Here, B must be a SwapWrapper whose packet function actually returns a proxy object holding a Scalar*,
* the actual alignment and Packet type.
*
*/
template<typename Scalar> struct swap_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
{
using std::swap;
swap(a,const_cast<Scalar&>(b));
}
template<int LhsAlignment, int RhsAlignment, typename Packet>
EIGEN_STRONG_INLINE void swapPacket(Scalar* a, Scalar* b) const
{
Packet tmp = internal::ploadt<Packet,RhsAlignment>(b);
internal::pstoret<Scalar,Packet,RhsAlignment>(b, internal::ploadt<Packet,LhsAlignment>(a));
internal::pstoret<Scalar,Packet,LhsAlignment>(a, tmp);
}
};
template<typename Scalar>
struct functor_traits<swap_assign_op<Scalar> > {
enum {
Cost = 3 * NumTraits<Scalar>::ReadCost,
PacketAccess = packet_traits<Scalar>::IsVectorized
};
};
} // namespace internal
} // namespace Eigen
#endif // EIGEN_ASSIGNMENT_FUNCTORS_H
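A simplified sketch of how an evaluation loop dispatches through these functors (not the actual Eigen kernel): the same loop performs =, +=, -=, *= or /= depending only on the functor it is given.
// cwise_assign_ref is an illustrative name; alignment handling and the
// packet path (assignPacket) are omitted for brevity.
template<typename Scalar, typename Functor>
void cwise_assign_ref(Scalar* dst, const Scalar* src, int n, const Functor& func)
{
  for (int i = 0; i < n; ++i)
    func.assignCoeff(dst[i], src[i]); // e.g. add_assign_op does dst[i] += src[i]
}
For instance, cwise_assign_ref(d, s, n, add_assign_op<double>()) would accumulate s into d.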

View File

@@ -1,456 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_BINARY_FUNCTORS_H
#define EIGEN_BINARY_FUNCTORS_H
namespace Eigen {
namespace internal {
//---------- associative binary functors ----------
/** \internal
* \brief Template functor to compute the sum of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
*/
template<typename Scalar> struct scalar_sum_op {
// typedef Scalar result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::padd(a,b); }
template<typename Packet>
EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
{ return internal::predux(a); }
};
template<typename Scalar>
struct functor_traits<scalar_sum_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasAdd
};
};
/** \internal
* \brief Template specialization to deprecate the summation of boolean expressions.
* This is required to solve Bug 426.
* \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
*/
template<> struct scalar_sum_op<bool> : scalar_sum_op<int> {
EIGEN_DEPRECATED
scalar_sum_op() {}
};
/** \internal
* \brief Template functor to compute the product of two scalars
*
* \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
*/
template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
enum {
// TODO vectorize mixed product
Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
};
typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmul(a,b); }
template<typename Packet>
EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux_mul(a); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
};
};
/** \internal
* \brief Template functor to compute the conjugate product of two scalars
*
* This is a shortcut for conj(x) * y, which is needed for optimization purposes; in Eigen2 support mode, this becomes x * conj(y)
*/
template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
enum {
Conj = NumTraits<LhsScalar>::IsComplex
};
typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
{ return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
enum {
Cost = NumTraits<LhsScalar>::MulCost,
PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
};
};
/** \internal
* \brief Template functor to compute the min of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
*/
template<typename Scalar> struct scalar_min_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { EIGEN_USING_STD_MATH(min); return (min)(a, b); }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmin(a,b); }
template<typename Packet>
EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
{ return internal::predux_min(a); }
};
template<typename Scalar>
struct functor_traits<scalar_min_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasMin
};
};
/** \internal
* \brief Template functor to compute the max of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
*/
template<typename Scalar> struct scalar_max_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { EIGEN_USING_STD_MATH(max); return (max)(a, b); }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmax(a,b); }
template<typename Packet>
EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
{ return internal::predux_max(a); }
};
template<typename Scalar>
struct functor_traits<scalar_max_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasMax
};
};
/** \internal
* \brief Template functor to compute the hypot of two scalars
*
* \sa MatrixBase::stableNorm(), class Redux
*/
template<typename Scalar> struct scalar_hypot_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
// typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
{
EIGEN_USING_STD_MATH(max);
EIGEN_USING_STD_MATH(min);
using std::sqrt;
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
return p * sqrt(Scalar(1) + qp*qp);
}
};
template<typename Scalar>
struct functor_traits<scalar_hypot_op<Scalar> > {
enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
};
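The rescaled form used by scalar_hypot_op matters numerically: the naive sqrt(x*x + y*y) overflows as soon as x*x does, while factoring out the larger magnitude keeps the intermediate close to 1. A sketch under the same assumption as the functor (non-negative inputs, larger one non-zero); the names are illustrative, not Eigen API:
#include <algorithm>
#include <cmath>
inline double hypot_naive(double x, double y)  { return std::sqrt(x*x + y*y); }
inline double hypot_scaled(double x, double y)
{
  const double p = std::max(x, y);
  const double q = std::min(x, y);
  const double qp = q / p;            // <= 1, so the sum below cannot overflow
  return p * std::sqrt(1.0 + qp*qp);
}
// hypot_naive(1e200, 1e200) overflows to inf;
// hypot_scaled(1e200, 1e200) returns about 1.414e200.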
/** \internal
* \brief Template functor to compute the pow of two scalars
*/
template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
EIGEN_DEVICE_FUNC
inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
};
template<typename Scalar, typename OtherScalar>
struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
};
//---------- non associative binary functors ----------
/** \internal
* \brief Template functor to compute the difference of two scalars
*
* \sa class CwiseBinaryOp, MatrixBase::operator-
*/
template<typename Scalar> struct scalar_difference_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::psub(a,b); }
};
template<typename Scalar>
struct functor_traits<scalar_difference_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasSub
};
};
/** \internal
* \brief Template functor to compute the quotient of two scalars
*
* \sa class CwiseBinaryOp, Cwise::operator/()
*/
template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
enum {
// TODO vectorize mixed product
Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
};
typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pdiv(a,b); }
};
template<typename LhsScalar,typename RhsScalar>
struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
enum {
Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
};
};
/** \internal
* \brief Template functor to compute the and of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator&&
*/
struct scalar_boolean_and_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
};
template<> struct functor_traits<scalar_boolean_and_op> {
enum {
Cost = NumTraits<bool>::AddCost,
PacketAccess = false
};
};
/** \internal
* \brief Template functor to compute the or of two booleans
*
* \sa class CwiseBinaryOp, ArrayBase::operator||
*/
struct scalar_boolean_or_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
};
template<> struct functor_traits<scalar_boolean_or_op> {
enum {
Cost = NumTraits<bool>::AddCost,
PacketAccess = false
};
};
//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
/** \internal
* \brief Template functor to multiply a scalar by a fixed other one
*
* \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
*/
/* NOTE why is doing the pset1() in packetOp an optimization?
* Indeed, it seems better to declare m_other as a Packet and do the pset1() once
* in the constructor. However, in practice:
* - GCC does not like m_other as a Packet and generates a load every time it needs it
* - on the other hand, GCC is able to move the pset1() outside the loop :)
* - simpler code ;)
* (ICC and gcc 4.4 seem to perform well in both cases; the issue is visible with y = a*x + b*y)
*/
template<typename Scalar>
struct scalar_multiple_op {
typedef typename packet_traits<Scalar>::type Packet;
// FIXME default copy constructors seem bugged with std::complex<>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return internal::pmul(a, pset1<Packet>(m_other)); }
typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
};
template<typename Scalar>
struct functor_traits<scalar_multiple_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
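A simplified sketch of the loop these unary functors feed (not the actual Eigen evaluator), showing why the per-call pset1() inside packetOp is harmless: the compiler is expected to hoist the splat out of the loop, as the NOTE above describes.
// apply_unary_ref is an illustrative name; unaligned heads/tails are ignored.
template<typename Scalar, typename Op>
void apply_unary_ref(Scalar* data, int n, const Op& op)
{
  typedef typename packet_traits<Scalar>::type Packet;
  const int step = packet_traits<Scalar>::size;
  int i = 0;
  for (; i + step <= n; i += step)  // vector path: packetOp re-does pset1(m_other)
    pstore(data + i, op.packetOp(pload<Packet>(data + i)));
  for (; i < n; ++i)                // scalar tail
    data[i] = op(data[i]);
}
e.g. apply_unary_ref(ptr, n, scalar_multiple_op<float>(2.f)) scales a buffer by 2.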
template<typename Scalar1, typename Scalar2>
struct scalar_multiple2_op {
typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
};
template<typename Scalar1,typename Scalar2>
struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
/** \internal
* \brief Template functor to divide a scalar by a fixed other one
*
* This functor is used to implement the quotient of a matrix by
* a scalar where the scalar type is not necessarily a floating point type.
*
* \sa class CwiseUnaryOp, MatrixBase::operator/
*/
template<typename Scalar>
struct scalar_quotient1_op {
typedef typename packet_traits<Scalar>::type Packet;
// FIXME default copy constructors seem bugged with std::complex<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return internal::pdiv(a, pset1<Packet>(m_other)); }
typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
};
template<typename Scalar>
struct functor_traits<scalar_quotient1_op<Scalar> >
{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
// In Eigen, any binary op (Product, CwiseBinaryOp) requires the Lhs and Rhs to have the same scalar type, except for multiplication
// where the mixing of different types is handled by scalar_product_traits
// In particular, real * complex<real> is allowed.
// FIXME move this to functor_traits adding a functor_default
template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
/** \internal
* \brief Template functor to add a scalar to a fixed other one
* \sa class CwiseUnaryOp, Array::operator+
*/
/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
template<typename Scalar>
struct scalar_add_op {
typedef typename packet_traits<Scalar>::type Packet;
// FIXME default copy constructors seem bugged with std::complex<>
EIGEN_DEVICE_FUNC inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
EIGEN_DEVICE_FUNC inline scalar_add_op(const Scalar& other) : m_other(other) { }
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a + m_other; }
inline const Packet packetOp(const Packet& a) const
{ return internal::padd(a, pset1<Packet>(m_other)); }
const Scalar m_other;
};
template<typename Scalar>
struct functor_traits<scalar_add_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
/** \internal
* \brief Template functor to subtract a fixed scalar from another one
* \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_rsub_op
*/
template<typename Scalar>
struct scalar_sub_op {
typedef typename packet_traits<Scalar>::type Packet;
inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { }
inline scalar_sub_op(const Scalar& other) : m_other(other) { }
inline Scalar operator() (const Scalar& a) const { return a - m_other; }
inline const Packet packetOp(const Packet& a) const
{ return internal::psub(a, pset1<Packet>(m_other)); }
const Scalar m_other;
};
template<typename Scalar>
struct functor_traits<scalar_sub_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
/** \internal
* \brief Template functor to subtract a scalar from a fixed other one
* \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_sub_op
*/
template<typename Scalar>
struct scalar_rsub_op {
typedef typename packet_traits<Scalar>::type Packet;
inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { }
inline scalar_rsub_op(const Scalar& other) : m_other(other) { }
inline Scalar operator() (const Scalar& a) const { return m_other - a; }
inline const Packet packetOp(const Packet& a) const
{ return internal::psub(pset1<Packet>(m_other), a); }
const Scalar m_other;
};
template<typename Scalar>
struct functor_traits<scalar_rsub_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
/** \internal
* \brief Template functor to raise a scalar to a power
* \sa class CwiseUnaryOp, Cwise::pow
*/
template<typename Scalar>
struct scalar_pow_op {
// FIXME default copy constructors seem to be buggy with std::complex<>
inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
EIGEN_DEVICE_FUNC
inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
const Scalar m_exponent;
};
template<typename Scalar>
struct functor_traits<scalar_pow_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
/** \internal
* \brief Template functor to compute the quotient between a scalar and array entries.
* \sa class CwiseUnaryOp, Cwise::inverse()
*/
template<typename Scalar>
struct scalar_inverse_mult_op {
scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other / a; }
template<typename Packet>
inline const Packet packetOp(const Packet& a) const
{ return internal::pdiv(pset1<Packet>(m_other),a); }
Scalar m_other;
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_BINARY_FUNCTORS_H
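A minimal usage sketch of the functors above (added for illustration, assuming Eigen 3.2 headers): an array-plus-scalar expression expands to scalar_add_op, and the same functor can be applied explicitly through unaryExpr():

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 0.0, 3.0);
  // array + scalar is implemented with internal::scalar_add_op<double>
  Eigen::ArrayXd b = a + 1.0;
  // the same functor can be passed explicitly to unaryExpr()
  Eigen::ArrayXd c = a.unaryExpr(Eigen::internal::scalar_add_op<double>(1.0));
  std::cout << (b - c).abs().maxCoeff() << "\n"; // prints 0
  return 0;
}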


@@ -1,6 +0,0 @@
FILE(GLOB Eigen_Core_Functor_SRCS "*.h")
INSTALL(FILES
${Eigen_Core_Functor_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/functors COMPONENT Devel
)


@@ -1,158 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_NULLARY_FUNCTORS_H
#define EIGEN_NULLARY_FUNCTORS_H
namespace Eigen {
namespace internal {
template<typename Scalar>
struct scalar_constant_op {
typedef typename packet_traits<Scalar>::type Packet;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
template<typename Index>
EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
const Scalar m_other;
};
template<typename Scalar>
struct functor_traits<scalar_constant_op<Scalar> >
// FIXME replace this packet test by a safe one
{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
template<typename Scalar> struct scalar_identity_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
};
template<typename Scalar>
struct functor_traits<scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
// linear access for packet ops:
// 1) initialization
// base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
// base += [size*step, ..., size*step]
//
// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
// in order to avoid the padd() in operator() ?
template <typename Scalar>
struct linspaced_op_impl<Scalar,false>
{
typedef typename packet_traits<Scalar>::type Packet;
linspaced_op_impl(const Scalar& low, const Scalar& step) :
m_low(low), m_step(step),
m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const
{
m_base = padd(m_base, pset1<Packet>(m_step));
return m_low+Scalar(i)*m_step;
}
template<typename Index>
EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
const Scalar m_low;
const Scalar m_step;
const Packet m_packetStep;
mutable Packet m_base;
};
// random access for packet ops:
// 1) each step
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
template <typename Scalar>
struct linspaced_op_impl<Scalar,true>
{
typedef typename packet_traits<Scalar>::type Packet;
linspaced_op_impl(const Scalar& low, const Scalar& step) :
m_low(low), m_step(step),
m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
template<typename Index>
EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
{ return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(i)),m_interPacket))); }
const Scalar m_low;
const Scalar m_step;
const Packet m_lowPacket;
const Packet m_stepPacket;
const Packet m_interPacket;
};
// ----- Linspace functor ----------------------------------------------------------------
// Forward declaration (we default to random access, which does not really give
// us a speed gain when using packet access, but it allows using the functor in
// nested expressions).
template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
template <typename Scalar, bool RandomAccess> struct linspaced_op
{
typedef typename packet_traits<Scalar>::type Packet;
linspaced_op(const Scalar& low, const Scalar& high, DenseIndex num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {}
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
// We need this function when assigning e.g. a RowVectorXd to a MatrixXd, since
// in that case row==0 and col is used for the actual iteration.
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const
{
eigen_assert(col==0 || row==0);
return impl(col + row);
}
template<typename Index>
EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
// We need this function when assigning e.g. a RowVectorXd to a MatrixXd, since
// in that case row==0 and col is used for the actual iteration.
template<typename Index>
EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
{
eigen_assert(col==0 || row==0);
return impl.packetOp(col + row);
}
// This proxy object handles the actual required temporaries, the different
// implementations (random vs. sequential access) as well as the
// correct piping to size 2/4 packet operations.
const linspaced_op_impl<Scalar,RandomAccess> impl;
};
// All functors allow linear access, except scalar_identity_op. So we define here a quick meta-function
// to indicate whether a functor allows linear access, always answering 'yes' except for
// scalar_identity_op.
// FIXME move this to functor_traits adding a functor_default
template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_NULLARY_FUNCTORS_H
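For reference, a minimal sketch (added for illustration, assuming Eigen 3.2) of the public entry points that map onto the nullary functors above: LinSpaced() uses linspaced_op, Constant() uses scalar_constant_op, and Identity() uses scalar_identity_op:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0); // linspaced_op
  std::cout << v.transpose() << "\n";                          // 0 0.25 0.5 0.75 1
  Eigen::MatrixXd c = Eigen::MatrixXd::Constant(2, 2, 3.0);    // scalar_constant_op
  Eigen::MatrixXd I = Eigen::MatrixXd::Identity(3, 3);         // scalar_identity_op
  std::cout << c.sum() << " " << I.trace() << "\n";            // 12 3
  return 0;
}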


@@ -1,129 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_STL_FUNCTORS_H
#define EIGEN_STL_FUNCTORS_H
namespace Eigen {
namespace internal {
// default functor traits for STL functors:
template<typename T>
struct functor_traits<std::multiplies<T> >
{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::divides<T> >
{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::plus<T> >
{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::minus<T> >
{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::negate<T> >
{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::logical_or<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::logical_and<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::logical_not<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::greater<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::less<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::greater_equal<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::less_equal<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::equal_to<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::not_equal_to<T> >
{ enum { Cost = 1, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::binder2nd<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::binder1st<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::unary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
template<typename T>
struct functor_traits<std::binary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
#ifdef EIGEN_STDEXT_SUPPORT
template<typename T0,typename T1>
struct functor_traits<std::project1st<T0,T1> >
{ enum { Cost = 0, PacketAccess = false }; };
template<typename T0,typename T1>
struct functor_traits<std::project2nd<T0,T1> >
{ enum { Cost = 0, PacketAccess = false }; };
template<typename T0,typename T1>
struct functor_traits<std::select2nd<std::pair<T0,T1> > >
{ enum { Cost = 0, PacketAccess = false }; };
template<typename T0,typename T1>
struct functor_traits<std::select1st<std::pair<T0,T1> > >
{ enum { Cost = 0, PacketAccess = false }; };
template<typename T0,typename T1>
struct functor_traits<std::unary_compose<T0,T1> >
{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
template<typename T0,typename T1,typename T2>
struct functor_traits<std::binary_compose<T0,T1,T2> >
{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
#endif // EIGEN_STDEXT_SUPPORT
// Allow adding new functors and specializations of functor_traits from outside Eigen.
// This macro is needed because functor_traits must be specialized after it is declared but before it is used...
#ifdef EIGEN_FUNCTORS_PLUGIN
#include EIGEN_FUNCTORS_PLUGIN
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_STL_FUNCTORS_H
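These traits exist so that STL functors can be used directly in Eigen expressions with a sensible cost model; a minimal sketch (added for illustration, assuming Eigen 3.2):

#include <Eigen/Dense>
#include <functional>
#include <iostream>

int main()
{
  Eigen::ArrayXd a(3), b(3);
  a << 1, 2, 3;
  b << 10, 20, 30;
  // functor_traits<std::plus<double> > above supplies the cost/packet info
  Eigen::ArrayXd s = a.binaryExpr(b, std::plus<double>());
  std::cout << s.transpose() << "\n"; // 11 22 33
  return 0;
}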


@@ -1,396 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_UNARY_FUNCTORS_H
#define EIGEN_UNARY_FUNCTORS_H
namespace Eigen {
namespace internal {
/** \internal
* \brief Template functor to compute the opposite of a scalar
*
* \sa class CwiseUnaryOp, MatrixBase::operator-
*/
template<typename Scalar> struct scalar_opposite_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return internal::pnegate(a); }
};
template<typename Scalar>
struct functor_traits<scalar_opposite_op<Scalar> >
{ enum {
Cost = NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasNegate };
};
/** \internal
* \brief Template functor to compute the absolute value of a scalar
*
* \sa class CwiseUnaryOp, Cwise::abs
*/
template<typename Scalar> struct scalar_abs_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return internal::pabs(a); }
};
template<typename Scalar>
struct functor_traits<scalar_abs_op<Scalar> >
{
enum {
Cost = NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasAbs
};
};
/** \internal
* \brief Template functor to compute the squared absolute value of a scalar
*
* \sa class CwiseUnaryOp, Cwise::abs2
*/
template<typename Scalar> struct scalar_abs2_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
{ return internal::pmul(a,a); }
};
template<typename Scalar>
struct functor_traits<scalar_abs2_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
/** \internal
* \brief Template functor to compute the conjugate of a complex value
*
* \sa class CwiseUnaryOp, MatrixBase::conjugate()
*/
template<typename Scalar> struct scalar_conjugate_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
};
template<typename Scalar>
struct functor_traits<scalar_conjugate_op<Scalar> >
{
enum {
Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
PacketAccess = packet_traits<Scalar>::HasConj
};
};
/** \internal
* \brief Template functor to cast a scalar to another type
*
* \sa class CwiseUnaryOp, MatrixBase::cast()
*/
template<typename Scalar, typename NewType>
struct scalar_cast_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef NewType result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
};
template<typename Scalar, typename NewType>
struct functor_traits<scalar_cast_op<Scalar,NewType> >
{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
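// Illustrative sketch (not part of the original file): scalar_cast_op backs
// DenseBase::cast<NewType>(), e.g.:
//   Eigen::VectorXf vf = Eigen::VectorXd::Ones(3).cast<float>();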
/** \internal
* \brief Template functor to extract the real part of a complex
*
* \sa class CwiseUnaryOp, MatrixBase::real()
*/
template<typename Scalar>
struct scalar_real_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
};
template<typename Scalar>
struct functor_traits<scalar_real_op<Scalar> >
{ enum { Cost = 0, PacketAccess = false }; };
/** \internal
* \brief Template functor to extract the imaginary part of a complex
*
* \sa class CwiseUnaryOp, MatrixBase::imag()
*/
template<typename Scalar>
struct scalar_imag_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
};
template<typename Scalar>
struct functor_traits<scalar_imag_op<Scalar> >
{ enum { Cost = 0, PacketAccess = false }; };
/** \internal
* \brief Template functor to extract the real part of a complex as a reference
*
* \sa class CwiseUnaryOp, MatrixBase::real()
*/
template<typename Scalar>
struct scalar_real_ref_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
};
template<typename Scalar>
struct functor_traits<scalar_real_ref_op<Scalar> >
{ enum { Cost = 0, PacketAccess = false }; };
/** \internal
* \brief Template functor to extract the imaginary part of a complex as a reference
*
* \sa class CwiseUnaryOp, MatrixBase::imag()
*/
template<typename Scalar>
struct scalar_imag_ref_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
};
template<typename Scalar>
struct functor_traits<scalar_imag_ref_op<Scalar> >
{ enum { Cost = 0, PacketAccess = false }; };
/** \internal
*
* \brief Template functor to compute the exponential of a scalar
*
* \sa class CwiseUnaryOp, Cwise::exp()
*/
template<typename Scalar> struct scalar_exp_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
};
template<typename Scalar>
struct functor_traits<scalar_exp_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
/** \internal
*
* \brief Template functor to compute the logarithm of a scalar
*
* \sa class CwiseUnaryOp, Cwise::log()
*/
template<typename Scalar> struct scalar_log_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
};
template<typename Scalar>
struct functor_traits<scalar_log_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
/** \internal
* \brief Template functor to compute the square root of a scalar
* \sa class CwiseUnaryOp, Cwise::sqrt()
*/
template<typename Scalar> struct scalar_sqrt_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
};
template<typename Scalar>
struct functor_traits<scalar_sqrt_op<Scalar> >
{ enum {
Cost = 5 * NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasSqrt
};
};
/** \internal
* \brief Template functor to compute the cosine of a scalar
* \sa class CwiseUnaryOp, ArrayBase::cos()
*/
template<typename Scalar> struct scalar_cos_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
};
template<typename Scalar>
struct functor_traits<scalar_cos_op<Scalar> >
{
enum {
Cost = 5 * NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasCos
};
};
/** \internal
* \brief Template functor to compute the sine of a scalar
* \sa class CwiseUnaryOp, ArrayBase::sin()
*/
template<typename Scalar> struct scalar_sin_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
};
template<typename Scalar>
struct functor_traits<scalar_sin_op<Scalar> >
{
enum {
Cost = 5 * NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasSin
};
};
/** \internal
* \brief Template functor to compute the tan of a scalar
* \sa class CwiseUnaryOp, ArrayBase::tan()
*/
template<typename Scalar> struct scalar_tan_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
};
template<typename Scalar>
struct functor_traits<scalar_tan_op<Scalar> >
{
enum {
Cost = 5 * NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasTan
};
};
/** \internal
* \brief Template functor to compute the arc cosine of a scalar
* \sa class CwiseUnaryOp, ArrayBase::acos()
*/
template<typename Scalar> struct scalar_acos_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
};
template<typename Scalar>
struct functor_traits<scalar_acos_op<Scalar> >
{
enum {
Cost = 5 * NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasACos
};
};
/** \internal
* \brief Template functor to compute the arc sine of a scalar
* \sa class CwiseUnaryOp, ArrayBase::asin()
*/
template<typename Scalar> struct scalar_asin_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
};
template<typename Scalar>
struct functor_traits<scalar_asin_op<Scalar> >
{
enum {
Cost = 5 * NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasASin
};
};
/** \internal
* \brief Template functor to compute the atan of a scalar
* \sa class CwiseUnaryOp, ArrayBase::atan()
*/
template<typename Scalar> struct scalar_atan_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
};
template<typename Scalar>
struct functor_traits<scalar_atan_op<Scalar> >
{
enum {
Cost = 5 * NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasATan
};
};
/** \internal
* \brief Template functor to compute the inverse of a scalar
* \sa class CwiseUnaryOp, Cwise::inverse()
*/
template<typename Scalar>
struct scalar_inverse_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
template<typename Packet>
inline const Packet packetOp(const Packet& a) const
{ return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
};
template<typename Scalar>
struct functor_traits<scalar_inverse_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
/** \internal
* \brief Template functor to compute the square of a scalar
* \sa class CwiseUnaryOp, Cwise::square()
*/
template<typename Scalar>
struct scalar_square_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; }
template<typename Packet>
inline const Packet packetOp(const Packet& a) const
{ return internal::pmul(a,a); }
};
template<typename Scalar>
struct functor_traits<scalar_square_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
/** \internal
* \brief Template functor to compute the cube of a scalar
* \sa class CwiseUnaryOp, Cwise::cube()
*/
template<typename Scalar>
struct scalar_cube_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; }
template<typename Packet>
inline const Packet packetOp(const Packet& a) const
{ return internal::pmul(a,pmul(a,a)); }
};
template<typename Scalar>
struct functor_traits<scalar_cube_op<Scalar> >
{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_UNARY_FUNCTORS_H
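A minimal sketch of the array API that maps onto the unary functors above (added for illustration, assuming Eigen 3.2):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 0.25, 1.0);
  std::cout << a.sqrt().transpose()    << "\n"; // scalar_sqrt_op
  std::cout << a.square().transpose()  << "\n"; // scalar_square_op
  std::cout << a.inverse().transpose() << "\n"; // scalar_inverse_op
  return 0;
}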


@@ -85,12 +85,12 @@ struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
| (EvalToRowMajor ? RowMajorBit : 0)
| NestingFlags
| (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
| (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
| (LhsFlags & RhsFlags & AlignedBit)
// TODO enable vectorization for mixed types
| (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
CoeffReadCost = InnerSize == Dynamic ? Dynamic
: InnerSize == 0 ? 0
: InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+ (InnerSize - 1) * NumTraits<Scalar>::AddCost,
@@ -134,20 +134,18 @@ class CoeffBasedProduct
};
typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
Unroll ? InnerSize-1 : Dynamic,
Unroll ? InnerSize : Dynamic,
_LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
public:
EIGEN_DEVICE_FUNC
inline CoeffBasedProduct(const CoeffBasedProduct& other)
: Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
{}
template<typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
: m_lhs(lhs), m_rhs(rhs)
{
@@ -160,10 +158,9 @@ class CoeffBasedProduct
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
{
Scalar res;
@@ -174,7 +171,6 @@ class CoeffBasedProduct
/* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
* which is why we don't set the LinearAccessBit.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
Scalar res;
@@ -189,33 +185,29 @@ class CoeffBasedProduct
{
PacketScalar res;
internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
Unroll ? InnerSize-1 : Dynamic,
Unroll ? InnerSize : Dynamic,
_LhsNested, _RhsNested, PacketScalar, LoadMode>
::run(row, col, m_lhs, m_rhs, res);
return res;
}
// Implicit conversion to the nested type (trigger the evaluation of the product)
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE operator const PlainObject& () const
{
m_result.lazyAssign(*this);
return m_result;
}
EIGEN_DEVICE_FUNC const _LhsNested& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const _RhsNested& rhs() const { return m_rhs; }
const _LhsNested& lhs() const { return m_lhs; }
const _RhsNested& rhs() const { return m_rhs; }
EIGEN_DEVICE_FUNC
const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
template<int DiagonalIndex>
EIGEN_DEVICE_FUNC
const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
EIGEN_DEVICE_FUNC
const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }
@@ -248,11 +240,20 @@ template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
res += lhs.coeff(row, UnrollingIndex-1) * rhs.coeff(UnrollingIndex-1, col);
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, 1, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
}
};
@@ -260,10 +261,9 @@ template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res)
{
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
res = RetScalar(0);
}
};
@@ -271,13 +271,9 @@ template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
{
eigen_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
for(Index i = 1; i < lhs.cols(); ++i)
res += lhs.coeff(row, i) * rhs.coeff(i, col);
res = (lhs.row(row).transpose().cwiseProduct( rhs.col(col) )).sum();
}
};
@@ -307,6 +303,16 @@ struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<InnerVectorizedTraversal, 0, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res)
{
res = 0;
}
};
template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
@@ -316,7 +322,7 @@ struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, Re
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
Packet pres;
product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
res = predux(pres);
}
};
@@ -384,7 +390,7 @@ struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode>(UnrollingIndex-1, col), res);
}
};
@@ -395,12 +401,12 @@ struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
struct product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
@@ -410,7 +416,7 @@ struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
struct product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
@@ -419,16 +425,35 @@ struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Packet &res)
{
res = pset1<Packet>(0);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Packet &res)
{
res = pset1<Packet>(0);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
{
eigen_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(Index i = 1; i < lhs.cols(); ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
res = pset1<Packet>(0);
for(Index i = 0; i < lhs.cols(); ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
@@ -438,10 +463,9 @@ struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
{
eigen_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
for(Index i = 1; i < lhs.cols(); ++i)
res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
res = pset1<Packet>(0);
for(Index i = 0; i < lhs.cols(); ++i)
res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
};
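The zero-size specializations and rewritten Dynamic loops above make products with an empty inner dimension well defined (the old code asserted lhs.cols()>0). A quick sketch of the resulting semantics (added for illustration):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A(2, 0), B(0, 2); // the inner dimension is zero
  Eigen::MatrixXd C = A * B;        // 2x2 result, filled with zeros
  std::cout << C << "\n";
  return 0;
}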

File diff suppressed because it is too large


@@ -23,8 +23,6 @@ template<
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
{
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index depth,
@@ -53,8 +51,6 @@ template<
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
{
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
@@ -67,9 +63,11 @@ static void run(Index rows, Index cols, Index depth,
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
//Index nc = blocking.nc(); // cache block size along the N direction
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@@ -82,68 +80,68 @@ static void run(Index rows, Index cols, Index depth,
Index tid = omp_get_thread_num();
Index threads = omp_get_num_threads();
LhsScalar* blockA = blocking.blockA();
eigen_internal_assert(blockA!=0);
std::size_t sizeA = kc*mc;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
std::size_t sizeB = kc*nc;
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
RhsScalar* blockB = blocking.blockB();
eigen_internal_assert(blockB!=0);
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
for(Index k=0; k<depth; k+=kc)
{
const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
// In order to reduce the chance that a thread has to wait for the other,
// let's start by packing B'.
pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
// let's start by packing A'.
pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
// Pack A_k to A' in a parallel fashion:
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
// Pack B_k to B' in a parallel fashion:
// each thread packs the sub block B_k,j to B'_j where j is the thread id.
// However, before copying to A'_i, we have to make sure that no other thread is still using it,
// However, before copying to B'_j, we have to make sure that no other thread is still using it,
// i.e., we test that info[tid].users equals 0.
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {}
info[tid].users += threads;
pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
// Notify the other threads that the part A'_i is ready to go.
pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
// Notify the other threads that the part B'_j is ready to go.
info[tid].sync = k;
// Computes C_i += A' * B' per A'_i
// Computes C_i += A' * B' per B'_j
for(Index shift=0; shift<threads; ++shift)
{
Index i = (tid+shift)%threads;
Index j = (tid+shift)%threads;
// At this point we have to make sure that A'_i has been updated by the thread i,
// At this point we have to make sure that B'_j has been updated by the thread j,
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
if(shift>0)
while(info[i].sync!=k) {}
gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
while(info[j].sync!=k) {}
gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
}
// Then keep going as usual with the remaining B'
for(Index j=nc; j<cols; j+=nc)
// Then keep going as usual with the remaining A'
for(Index i=mc; i<rows; i+=mc)
{
const Index actual_nc = (std::min)(j+nc,cols)-j;
const Index actual_mc = (std::min)(i+mc,rows)-i;
// pack B_k,j to B'
pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
// pack A_i,k to A'
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
// C_j += A' * B'
gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
// C_i += A' * B'
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
}
// Release all the sub blocks A'_i of A' for the current thread,
// Release all the sub blocks B'_j of B' for the current thread,
// i.e., we simply decrement the number of users by 1
#pragma omp critical
{
for(Index i=0; i<threads; ++i)
for(Index j=0; j<threads; ++j)
#pragma omp atomic
--(info[i].users);
}
--(info[j].users);
}
}
else
@@ -153,34 +151,38 @@ static void run(Index rows, Index cols, Index depth,
// this is the sequential version!
std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*nc;
std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1)
for(Index k2=0; k2<depth; k2+=kc)
{
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the rhs's
// horizontal panel which is, in practice, a very low number.
pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
// => Pack rhs's panel into a sequential chunk of memory (L2 caching)
// Note that this panel will be read as many times as the number of blocks in the lhs's
// vertical panel which is, in practice, a very low number.
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
// For each kc x nc block of the rhs's horizontal panel...
for(Index j2=0; j2<cols; j2+=nc)
// For each mc x kc block of the lhs's vertical panel...
// (==GEPP_VAR1)
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
// We pack the lhs's block into a sequential chunk of memory (L1 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
// micro vertical panel of the large rhs's panel (e.g., cols/4 times).
pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
// Everything is packed, we can now call the panel * block kernel:
gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
// Everything is packed, we can now call the block * panel kernel:
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
}
}
}
@@ -201,13 +203,14 @@ struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
struct gemm_functor
{
gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
BlockingType& blocking)
: m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
{}
void initParallelSession() const
{
m_blocking.allocateA();
m_blocking.allocateB();
}
void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
@@ -221,8 +224,6 @@ struct gemm_functor
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
m_actualAlpha, m_blocking, info);
}
typedef typename Gemm::Traits Traits;
protected:
const Lhs& m_lhs;
@@ -244,6 +245,7 @@ class level3_blocking
protected:
LhsScalar* m_blockA;
RhsScalar* m_blockB;
RhsScalar* m_blockW;
DenseIndex m_mc;
DenseIndex m_nc;
@@ -252,7 +254,7 @@ class level3_blocking
public:
level3_blocking()
: m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
: m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
{}
inline DenseIndex mc() const { return m_mc; }
@@ -261,6 +263,7 @@ class level3_blocking
inline LhsScalar* blockA() { return m_blockA; }
inline RhsScalar* blockB() { return m_blockB; }
inline RhsScalar* blockW() { return m_blockW; }
};
template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
@@ -279,25 +282,29 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
enum {
SizeA = ActualRows * MaxDepth,
SizeB = ActualCols * MaxDepth
SizeB = ActualCols * MaxDepth,
SizeW = MaxDepth * Traits::WorkSpaceFactor
};
EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];
EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
public:
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
{
this->m_mc = ActualRows;
this->m_nc = ActualCols;
this->m_kc = MaxDepth;
this->m_blockA = m_staticA;
this->m_blockB = m_staticB;
this->m_blockW = m_staticW;
}
inline void allocateA() {}
inline void allocateB() {}
inline void allocateW() {}
inline void allocateAll() {}
};
@@ -316,28 +323,20 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
DenseIndex m_sizeA;
DenseIndex m_sizeB;
DenseIndex m_sizeW;
public:
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
{
this->m_mc = Transpose ? cols : rows;
this->m_nc = Transpose ? rows : cols;
this->m_kc = depth;
if(full_rows)
{
DenseIndex m = this->m_mc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
}
else // full columns
{
DenseIndex n = this->m_nc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
}
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
}
void allocateA()
@@ -352,16 +351,24 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
}
void allocateW()
{
if(this->m_blockW==0)
this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
}
void allocateAll()
{
allocateA();
allocateB();
allocateW();
}
~gemm_blocking_space()
{
aligned_delete(this->m_blockA, m_sizeA);
aligned_delete(this->m_blockB, m_sizeB);
aligned_delete(this->m_blockW, m_sizeW);
}
};
@@ -386,37 +393,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
}
template<typename Dest>
inline void evalTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() = m_lhs .lazyProduct( m_rhs );
else
{
dst.setZero();
scaleAndAddTo(dst,Scalar(1));
}
}
template<typename Dest>
inline void addTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() += m_lhs .lazyProduct( m_rhs );
else
scaleAndAddTo(dst,Scalar(1));
}
template<typename Dest>
inline void subTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() -= m_lhs .lazyProduct( m_rhs );
else
scaleAndAddTo(dst,Scalar(-1));
}
template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
{
eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
@@ -439,7 +416,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
_ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
}
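For orientation, a sketch of a matrix product that takes the blocked GEMM path discussed in this hunk (added for illustration, assuming Eigen 3.2):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(256, 256);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(256, 256);
  Eigen::MatrixXd C(256, 256);
  C.noalias() = A * B; // dispatches to GeneralProduct<...,GemmProduct>
  std::cout << C(0, 0) << "\n";
  return 0;
}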


@@ -73,8 +73,11 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
if(mc > Traits::nr)
mc = (mc/Traits::nr)*Traits::nr;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*size;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
RhsScalar* blockB = allocatedBlockB + sizeW;
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@@ -100,15 +103,15 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
// 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower)
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-1, -1, 0, 0);
-1, -1, 0, 0, allocatedBlockB);
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
if (UpLo==Upper)
{
Index j2 = i2+actual_mc;
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-1, -1, 0, 0);
-1, -1, 0, 0, allocatedBlockB);
}
}
}
@@ -133,7 +136,7 @@ struct tribb_kernel
enum {
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
};
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
{
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
@@ -147,7 +150,7 @@ struct tribb_kernel
if(UpLo==Upper)
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
-1, -1, 0, 0, workspace);
// selfadjoint micro block
{
@@ -155,7 +158,7 @@ struct tribb_kernel
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
-1, -1, 0, 0, workspace);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
@@ -170,7 +173,7 @@ struct tribb_kernel
{
Index i = j+actualBlockSize;
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
-1, -1, 0, 0, workspace);
}
}
}
@@ -265,8 +268,6 @@ template<typename MatrixType, unsigned int UpLo>
template<typename ProductDerived, typename _Lhs, typename _Rhs>
TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
{
eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
return *this;
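The assignProduct path above sits on the same kernel that selfadjoint rank updates use; a minimal sketch of one public entry point (added for illustration, assuming Eigen 3.2):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 3);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(4, 4);
  // C += 1.0 * A * A^T, updating only the lower triangle
  C.selfadjointView<Eigen::Lower>().rankUpdate(A, 1.0);
  std::cout << C(0, 0) << "\n";
  return 0;
}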


@@ -53,8 +53,6 @@ template< \
int RhsStorageOrder, bool ConjugateRhs> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
{ \
typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
\
static void run(Index rows, Index cols, Index depth, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \


@@ -26,34 +26,6 @@ namespace internal {
* |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
* |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
* |cplx |real |real | optimal case, vectorization possible via real-cplx mul
*
* Accesses to the matrix coefficients obey the following logic:
*
* - if all columns have the same alignment then
* - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
* - otherwise perform unaligned loads only (-> NoneAligned case)
* - otherwise
* - if even columns have the same alignment then
* // odd columns are guaranteed to have the same alignment too
* - if even or odd columns have the same alignment as the result, then
* // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
* - perform half aligned and half unaligned loads (-> EvenAligned case)
* - otherwise perform unaligned loads only (-> NoneAligned case)
* - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
* - one out of 4 consecutive columns is guaranteed to be aligned with the result vector,
* perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
* // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
* - otherwise,
* // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
* // we currently fall back to the NoneAligned case
*
* The same reasoning applies to the transposed case.
*
* The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
* One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
* strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow
* compared to unaligned loads on a 4 byte boundary.
*
*/
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
@@ -80,8 +52,7 @@ EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
ResScalar* res, Index resIncr,
RhsScalar alpha);
ResScalar* res, Index resIncr, RhsScalar alpha);
};
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
@@ -89,10 +60,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
ResScalar* res, Index resIncr,
RhsScalar alpha)
ResScalar* res, Index resIncr, RhsScalar alpha)
{
EIGEN_UNUSED_VARIABLE(resIncr);
EIGEN_UNUSED_VARIABLE(resIncr)
eigen_internal_assert(resIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
@@ -141,12 +111,6 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
alignedSize = 0;
alignedStart = 0;
}
else if(LhsPacketSize > 4)
{
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
// Currently, it seems to be better to perform unaligned loads anyway
alignmentPattern = NoneAligned;
}
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
@@ -351,7 +315,7 @@ EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
ResScalar* res, Index resIncr,
ResScalar* res, Index resIncr,
ResScalar alpha);
};
@@ -365,7 +329,6 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
EIGEN_UNUSED_VARIABLE(rhsIncr);
eigen_internal_assert(rhsIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
@@ -411,11 +374,6 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
alignedSize = 0;
alignedStart = 0;
}
else if(LhsPacketSize > 4)
{
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
alignmentPattern = NoneAligned;
}
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
@@ -453,7 +411,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
{
EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generate good binary code
@@ -562,7 +520,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
for (Index i=start; i<end; ++i)
{
EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
const LhsScalar* lhs0 = lhs + i*lhsStride;
// process first unaligned result's coeffs

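The long comment in the hunk above documents how the column-major matrix-vector kernel picks one of the AllAligned, EvenAligned, FirstAligned and NoneAligned strategies. A simplified standalone sketch of that decision, assuming 16-byte packets and ignoring the peeling details the real kernel also handles (illustrative only, not Eigen's implementation):

#include <cstddef>
#include <cstdint>
#include <cstdio>

enum AlignmentPattern { AllAligned, EvenAligned, FirstAligned, NoneAligned };

// Offset, in scalars, from p to the first element aligned on a full packet.
template<typename Scalar>
std::size_t first_aligned_offset(const Scalar* p, std::size_t packetSize)
{
  std::size_t byteOff = reinterpret_cast<std::uintptr_t>(p) % (sizeof(Scalar)*packetSize);
  return byteOff==0 ? 0 : packetSize - byteOff/sizeof(Scalar);
}

// Compare the per-column alignment offsets against the result vector's.
template<typename Scalar>
AlignmentPattern choose_pattern(const Scalar* lhs, std::size_t lhsStride,
                                const Scalar* res, std::size_t packetSize)
{
  std::size_t resOff  = first_aligned_offset(res, packetSize);
  std::size_t col0Off = first_aligned_offset(lhs, packetSize);
  std::size_t col1Off = first_aligned_offset(lhs + lhsStride, packetSize);
  if (col0Off == col1Off)                      // all columns share one offset
    return col0Off == resOff ? AllAligned : NoneAligned;
  if (packetSize == 2) return EvenAligned;     // e.g. SSE with double
  if (packetSize == 4) return FirstAligned;    // e.g. SSE with float
  return NoneAligned;                          // PacketSize > 4: fall back
}

int main()
{
  float buf[64];  // hypothetical storage; only the addresses matter here
  std::printf("pattern = %d\n", (int)choose_pattern(buf, 9, buf, 4));
}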

@@ -73,13 +73,13 @@ namespace internal {
template<typename Index> struct GemmParallelInfo
{
GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
int volatile sync;
int volatile users;
Index lhs_start;
Index lhs_length;
Index rhs_start;
Index rhs_length;
};
template<bool Condition, typename Functor, typename Index>
@@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
if((!Condition) || (omp_get_num_threads()>1))
return func(0,rows, 0,cols);
Index size = transpose ? rows : cols;
Index size = transpose ? cols : rows;
// 2- compute the maximal number of threads from the size of the product:
// FIXME this has to be fine tuned
@@ -125,26 +125,30 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
if(transpose)
std::swap(rows,cols);
Index blockCols = (cols / threads) & ~Index(0x3);
Index blockRows = (rows / threads);
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
#pragma omp parallel num_threads(threads)
{
Index i = omp_get_thread_num();
// Note that the actual number of threads might be lower than the number of requested ones.
Index actual_threads = omp_get_num_threads();
Index blockCols = (cols / actual_threads) & ~Index(0x3);
Index blockRows = (rows / actual_threads) & ~Index(0x7);
Index r0 = i*blockRows;
Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
Index c0 = i*blockCols;
Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
info[i].lhs_start = r0;
info[i].lhs_length = actualBlockRows;
info[i].rhs_start = c0;
info[i].rhs_length = actualBlockCols;
if(transpose) func(c0, actualBlockCols, 0, rows, info);
else func(0, rows, c0, actualBlockCols, info);
if(transpose)
func(0, cols, r0, actualBlockRows, info);
else
func(r0, actualBlockRows, 0,cols, info);
}
delete[] info;

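One side of the hunk above computes the block bounds inside the OpenMP parallel region from omp_get_num_threads(), because the runtime may grant fewer threads than requested and stale bounds would leave rows unprocessed. A minimal sketch of the same pattern, with a hypothetical print standing in for the GEMM functor (compile with -fopenmp):

#include <omp.h>
#include <cstdio>

// Row-blocked dispatch in the style of parallelize_gemm: each thread owns
// the slice [r0, r0+actualBlockRows), and the last thread absorbs the
// remainder rows.
void parallel_rows_sketch(int rows, int cols, int requested_threads)
{
  #pragma omp parallel num_threads(requested_threads)
  {
    int i = omp_get_thread_num();
    // The block size must come from the actual team size, not the
    // requested one.
    int actual_threads = omp_get_num_threads();
    int blockRows = (rows / actual_threads) & ~int(0x7); // multiple of 8
    int r0 = i * blockRows;
    int actualBlockRows = (i+1 == actual_threads) ? rows - r0 : blockRows;
    std::printf("thread %d: rows [%d, %d) x cols [0, %d)\n",
                i, r0, r0 + actualBlockRows, cols);
  }
}

int main() { parallel_rows_sketch(1000, 512, 4); }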

@@ -15,7 +15,7 @@ namespace Eigen {
namespace internal {
// pack a selfadjoint block diagonal for use with the gebp_kernel
template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
struct symm_pack_lhs
{
template<int BlockRows> inline
@@ -45,32 +45,25 @@ struct symm_pack_lhs
}
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{
enum { PacketSize = packet_traits<Scalar>::size };
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
//Index peeled_mc3 = (rows/Pack1)*Pack1;
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
if(Pack1>=3*PacketSize)
for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
pack<3*PacketSize>(blockA, lhs, cols, i, count);
if(Pack1>=2*PacketSize)
for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
pack<2*PacketSize>(blockA, lhs, cols, i, count);
if(Pack1>=1*PacketSize)
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
pack<1*PacketSize>(blockA, lhs, cols, i, count);
Index peeled_mc = (rows/Pack1)*Pack1;
for(Index i=0; i<peeled_mc; i+=Pack1)
{
pack<Pack1>(blockA, lhs, cols, i, count);
}
if(rows-peeled_mc>=Pack2)
{
pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
peeled_mc += Pack2;
}
// do the same with mr==1
for(Index i=peeled_mc1; i<rows; i++)
for(Index i=peeled_mc; i<rows; i++)
{
for(Index k=0; k<i; k++)
blockA[count++] = lhs(i, k); // normal
blockA[count++] = lhs(i, k); // normal
blockA[count++] = numext::real(lhs(i, i)); // real (diagonal)
@@ -89,8 +82,7 @@ struct symm_pack_rhs
Index end_k = k2 + rows;
Index count = 0;
const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
Index packet_cols = (cols/nr)*nr;
// first part: normal case
for(Index j2=0; j2<k2; j2+=nr)
@@ -99,151 +91,79 @@ struct symm_pack_rhs
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
if (nr>=4)
if (nr==4)
{
blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3);
}
if (nr>=8)
{
blockB[count+4] = rhs(k,j2+4);
blockB[count+5] = rhs(k,j2+5);
blockB[count+6] = rhs(k,j2+6);
blockB[count+7] = rhs(k,j2+7);
}
count += nr;
}
}
// second part: diagonal block
Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
if(nr>=8)
for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
{
for(Index j2=k2; j2<end8; j2+=8)
// again we can split vertically in three different parts (transpose, symmetric, normal)
// transpose
for(Index k=k2; k<j2; k++)
{
// again we can split vertically in three different parts (transpose, symmetric, normal)
// transpose
for(Index k=k2; k<j2; k++)
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
if (nr==4)
{
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k));
blockB[count+4] = numext::conj(rhs(j2+4,k));
blockB[count+5] = numext::conj(rhs(j2+5,k));
blockB[count+6] = numext::conj(rhs(j2+6,k));
blockB[count+7] = numext::conj(rhs(j2+7,k));
count += 8;
}
// symmetric
Index h = 0;
for(Index k=j2; k<j2+8; k++)
{
// normal
for (Index w=0 ; w<h; ++w)
blockB[count+w] = rhs(k,j2+w);
blockB[count+h] = numext::real(rhs(k,k));
// transpose
for (Index w=h+1 ; w<8; ++w)
blockB[count+w] = numext::conj(rhs(j2+w,k));
count += 8;
++h;
}
// normal
for(Index k=j2+8; k<end_k; k++)
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3);
blockB[count+4] = rhs(k,j2+4);
blockB[count+5] = rhs(k,j2+5);
blockB[count+6] = rhs(k,j2+6);
blockB[count+7] = rhs(k,j2+7);
count += 8;
}
count += nr;
}
}
if(nr>=4)
{
for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
// symmetric
Index h = 0;
for(Index k=j2; k<j2+nr; k++)
{
// again we can split vertically in three different parts (transpose, symmetric, normal)
// transpose
for(Index k=k2; k<j2; k++)
{
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k));
count += 4;
}
// symmetric
Index h = 0;
for(Index k=j2; k<j2+4; k++)
{
// normal
for (Index w=0 ; w<h; ++w)
blockB[count+w] = rhs(k,j2+w);
blockB[count+h] = numext::real(rhs(k,k));
// transpose
for (Index w=h+1 ; w<4; ++w)
blockB[count+w] = numext::conj(rhs(j2+w,k));
count += 4;
++h;
}
// normal
for(Index k=j2+4; k<end_k; k++)
for (Index w=0 ; w<h; ++w)
blockB[count+w] = rhs(k,j2+w);
blockB[count+h] = numext::real(rhs(k,k));
// transpose
for (Index w=h+1 ; w<nr; ++w)
blockB[count+w] = numext::conj(rhs(j2+w,k));
count += nr;
++h;
}
// normal
for(Index k=j2+nr; k<end_k; k++)
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
if (nr==4)
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3);
count += 4;
}
count += nr;
}
}
// third part: transposed
if(nr>=8)
for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
{
for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
for(Index k=k2; k<end_k; k++)
{
for(Index k=k2; k<end_k; k++)
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
if (nr==4)
{
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k));
blockB[count+4] = numext::conj(rhs(j2+4,k));
blockB[count+5] = numext::conj(rhs(j2+5,k));
blockB[count+6] = numext::conj(rhs(j2+6,k));
blockB[count+7] = numext::conj(rhs(j2+7,k));
count += 8;
}
}
}
if(nr>=4)
{
for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
{
for(Index k=k2; k<end_k; k++)
{
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k));
count += 4;
}
count += nr;
}
}
// copy the remaining columns one at a time (=> the same with nr==1)
for(Index j2=packet_cols4; j2<cols; ++j2)
for(Index j2=packet_cols; j2<cols; ++j2)
{
// transpose
Index half = (std::min)(end_k,j2);
@@ -341,10 +261,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// kc must be smaller than mc
kc = (std::min)(kc,mc);
std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
Scalar* blockB = allocatedBlockB + sizeW;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -427,10 +348,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
Scalar* blockB = allocatedBlockB + sizeW;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -500,11 +422,11 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
::run(
lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha // alpha
lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha // alpha
);
}
};

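symm_pack_rhs above splits each diagonal panel into transpose, symmetric and normal parts so that only one triangle of the matrix is ever read. A small real-valued sketch of that three-part loop for an nr-wide panel (the conj_/real_ helpers are identities here and merely mirror the complex path; all names are hypothetical):

// Minimal column-major accessor standing in for const_blas_data_mapper.
struct MapperSketch {
  const double* data; int stride;
  double operator()(int r, int c) const { return data[r + c*stride]; }
};

inline double conj_(double x) { return x; }  // identity for real scalars
inline double real_(double x) { return x; }

// Pack columns [j2, j2+nr) over rows [k2, end_k) of a symmetric rhs.
void pack_diagonal_panel(double* blockB, const MapperSketch& rhs,
                         int k2, int end_k, int j2, int nr)
{
  int count = 0;
  // transpose: rows above the diagonal block read the opposite triangle
  for (int k = k2; k < j2; ++k, count += nr)
    for (int w = 0; w < nr; ++w)
      blockB[count+w] = conj_(rhs(j2+w, k));
  // symmetric: the nr x nr block crossing the diagonal mixes both triangles
  for (int k = j2, h = 0; k < j2+nr; ++k, ++h, count += nr) {
    for (int w = 0; w < h; ++w)    blockB[count+w] = rhs(k, j2+w);        // normal
    blockB[count+h] = real_(rhs(k, k));                                   // diagonal
    for (int w = h+1; w < nr; ++w) blockB[count+w] = conj_(rhs(j2+w, k)); // transpose
  }
  // normal: remaining rows read their coefficients directly
  for (int k = j2+nr; k < end_k; ++k, count += nr)
    for (int w = 0; w < nr; ++w)
      blockB[count+w] = rhs(k, j2+w);
}

int main()
{
  double A[16] = {0};  // 4x4 symmetric matrix, column-major, lower triangle valid
  double blockB[16];
  MapperSketch rhs = { A, 4 };
  pack_diagonal_panel(blockB, rhs, 0, 4, 0, 4);  // hypothetical panel sizes
}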

@@ -113,9 +113,9 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
for (size_t i=starti; i<alignedStart; ++i)
{
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
res[i] += t0 * A0[i] + t1 * A1[i];
t2 += numext::conj(A0[i]) * rhs[i];
t3 += numext::conj(A1[i]) * rhs[i];
}
// Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed up)
// gcc 4.2 does this optimization automatically.
@@ -218,7 +218,7 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
if(!EvalToDest)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
Index size = dest.size();
int size = dest.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
MappedDest(actualDestPtr, dest.size()) = dest;
@@ -227,7 +227,7 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
if(!UseRhs)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
Index size = rhs.size();
int size = rhs.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;

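The hunk above toggles between explicit numext::conj calls and the cj0/cj1 conjugation helpers; such a helper folds an optional conjugation into the multiply so one kernel body serves both the conjugated and plain cases. A minimal sketch of the idea (hypothetical, much simpler than Eigen's conj_helper):

#include <complex>
#include <iostream>

// Multiply, conjugating the first operand when Conj is true; the choice is
// a compile-time constant, so the kernel loop itself never branches on it.
template<typename Scalar, bool Conj>
struct conj_helper_sketch {
  Scalar pmul(const Scalar& a, const Scalar& b) const {
    return Conj ? Scalar(std::conj(a) * b) : Scalar(a * b);
  }
};

int main()
{
  std::complex<double> a(1, 2), b(3, -1);
  conj_helper_sketch<std::complex<double>, false> cj0;
  conj_helper_sketch<std::complex<double>, true>  cj1;
  std::cout << cj0.pmul(a, b) << " " << cj1.pmul(a, b) << "\n";
}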

@@ -125,9 +125,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
triangularBuffer.setZero();
@@ -185,7 +187,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
// GEBP with remaining micro panel
if (lengthTarget>0)
@@ -195,7 +197,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
}
}
}
@@ -209,7 +211,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
}
}
}
@@ -263,10 +265,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);
std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
triangularBuffer.setZero();
@@ -300,7 +304,6 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
Scalar* geb = blockB+ts*ts;
geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
@@ -354,13 +357,14 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
actual_mc, panelLength, actualPanelWidth,
alpha,
actual_kc, actual_kc, // strides
blockOffset, blockOffset);// offsets
blockOffset, blockOffset,// offsets
blockW); // workspace
}
}
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
blockA, geb, actual_mc, actual_kc, rs,
alpha,
-1, -1, 0, 0);
-1, -1, 0, 0, blockW);
}
}
}
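Several hunks above toggle between a gebp_kernel that takes an explicit blockW workspace, carved from the head of a single allocation (sizeB = sizeW + kc*cols; blockB = allocatedBlockB + sizeW), and one that needs no workspace. A sketch of that head-of-buffer layout, assuming C++17 std::aligned_alloc rather than Eigen's ei_declare_aligned_stack_constructed_variable, with illustrative sizes:

#include <cstdlib>
#include <cstdio>

int main()
{
  const std::size_t kc = 256, cols = 512, workSpaceFactor = 8;
  const std::size_t sizeW = kc * workSpaceFactor;  // gebp scratch, in scalars
  const std::size_t sizeB = sizeW + kc * cols;     // one allocation for both
  // One aligned buffer; the byte count here is a multiple of the alignment,
  // as std::aligned_alloc requires.
  double* allocatedBlockB =
      static_cast<double*>(std::aligned_alloc(16, sizeB * sizeof(double)));
  double* blockW = allocatedBlockB;          // workspace lives at the head
  double* blockB = allocatedBlockB + sizeW;  // packed rhs panels follow it
  std::printf("workspace: %zu scalars at %p, packed B: %zu scalars at %p\n",
              sizeW, (void*)blockW, sizeB - sizeW, (void*)blockB);
  std::free(allocatedBlockB);
}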

Some files were not shown because too many files have changed in this diff.