bump to 3.3-alpha1

Workaround wrong instantiation made by VS2010
Fix returned index type of inner iterators of sparse blocks.
2026-04-10 11:34:33 +08:00 · 2015-09-04 17:26:36 +02:00 · 2015-09-04 15:25:58 +02:00 · 2015-09-03 15:07:35 +02:00 · 2015-09-03 14:53:51 +02:00 · 2015-09-03 14:14:54 +02:00
433 changed files with 19636 additions and 5997 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,6 +147,12 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wenum-conversion")
  ei_add_cxx_compiler_flag("-Wc++11-extensions")
  
+  # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
+  # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
+  if(NOT CMAKE_COMPILER_IS_GNUCXX)
+    ei_add_cxx_compiler_flag("-Wshadow")
+  endif()
+  
  ei_add_cxx_compiler_flag("-Wno-psabi")
  ei_add_cxx_compiler_flag("-Wno-variadic-macros")
  ei_add_cxx_compiler_flag("-Wno-long-long")
@@ -168,6 +174,11 @@ if(NOT MSVC)
  else()
    ei_add_cxx_compiler_flag("-ansi")
  endif()
+
+  if(ANDROID_NDK)
+    ei_add_cxx_compiler_flag("-pie")
+    ei_add_cxx_compiler_flag("-fPIE")
+  endif()
  
  set(CMAKE_REQUIRED_FLAGS "")

@@ -208,7 +219,7 @@ if(NOT MSVC)
  endif()

  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
-  if(EIGEN_TEST_FMA)
+  if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
    message(STATUS "Enabling FMA in tests/examples")
  endif()
@@ -227,7 +238,12 @@ if(NOT MSVC)

  option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
  if(EIGEN_TEST_NEON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8")
+    if(EIGEN_TEST_FMA)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4")
+    else()
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
+    endif()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp")
    message(STATUS "Enabling NEON in tests/examples")
  endif()

@@ -321,7 +337,7 @@ if(EIGEN_TEST_NO_EXCEPTIONS)
  message(STATUS "Disabling exceptions in tests/examples")
 endif()

-option(EIGEN_TEST_C++0x "Enables all C++0x features." OFF)
+option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

--- a/Eigen/Core
+++ b/Eigen/Core
@@ -24,9 +24,15 @@
  #ifdef EIGEN_INTERNAL_DEBUGGING
  #undef EIGEN_INTERNAL_DEBUGGING
  #endif
-  
+
  // Do not try to vectorize on CUDA!
+  #ifndef EIGEN_DONT_VECTORIZE
  #define EIGEN_DONT_VECTORIZE
+  #endif
+
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
  
  // All functions callable from CUDA code must be qualified with __device__
  #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -67,9 +73,9 @@
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"

-// if alignment is disabled, then disable vectorization. Note: EIGEN_ALIGN is the proper check, it takes into
-// account both the user's will (EIGEN_DONT_ALIGN) and our own platform checks
-#if !EIGEN_ALIGN
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
  #ifndef EIGEN_DONT_VECTORIZE
    #define EIGEN_DONT_VECTORIZE
  #endif
@@ -125,6 +131,12 @@
      #define EIGEN_VECTORIZE_SSE4_1
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
+    #ifdef __AVX2__
+      #define EIGEN_VECTORIZE_AVX2
+    #endif
+    #ifdef __FMA__
+      #define EIGEN_VECTORIZE_FMA
+    #endif

    // include files

@@ -178,7 +190,7 @@
    #undef bool
    #undef vector
    #undef pixel
-  #elif defined  __ARM_NEON
+  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_NEON
    #include <arm_neon.h>
@@ -294,15 +306,19 @@ using std::ptrdiff_t;
  // Use AVX for floats and doubles, SSE for integers
  #include "src/Core/arch/SSE/PacketMath.h"
  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
  #include "src/Core/arch/AVX/PacketMath.h"
  #include "src/Core/arch/AVX/MathFunctions.h"
  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
 #elif defined EIGEN_VECTORIZE_SSE
  #include "src/Core/arch/SSE/PacketMath.h"
  #include "src/Core/arch/SSE/MathFunctions.h"
  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
  #include "src/Core/arch/AltiVec/PacketMath.h"
+  #include "src/Core/arch/AltiVec/MathFunctions.h"
  #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
  #include "src/Core/arch/NEON/PacketMath.h"
@@ -343,7 +359,6 @@ using std::ptrdiff_t;
 #include "src/Core/NestByValue.h"

 // #include "src/Core/ForceAlignedAccess.h"
-// #include "src/Core/Flagged.h"

 #include "src/Core/ReturnByValue.h"
 #include "src/Core/NoAlias.h"
@@ -375,7 +390,6 @@ using std::ptrdiff_t;
 #include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
-#include "src/Core/ProductBase.h"
 #include "src/Core/GeneralProduct.h"
 #include "src/Core/Solve.h"
 #include "src/Core/Inverse.h"
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -9,10 +9,6 @@
 #include "LU"
 #include <limits>

-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
 /** \defgroup Geometry_Module Geometry module
  *
  *
--- a/Eigen/IterativeLinearSolvers
+++ b/Eigen/IterativeLinearSolvers
@@ -12,24 +12,26 @@
  * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse.
  * Those solvers are accessible via the following classes:
  *  - ConjugateGradient for selfadjoint (hermitian) matrices,
+  *  - LeastSquaresConjugateGradient for rectangular least-squares problems,
  *  - BiCGSTAB for general square matrices.
  *
  * These iterative solvers are associated with some preconditioners:
  *  - IdentityPreconditioner - not really useful
-  *  - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices.
-  *  - IncompleteILUT - incomplete LU factorization with dual thresholding
+  *  - DiagonalPreconditioner - also called Jacobi preconditioner, works very well on diagonally dominant matrices.
+  *  - IncompleteLUT - incomplete LU factorization with dual thresholding
  *
  * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
  *
-  * \code
-  * #include <Eigen/IterativeLinearSolvers>
-  * \endcode
+    \code
+    #include <Eigen/IterativeLinearSolvers>
+    \endcode
  */

 #include "src/IterativeLinearSolvers/SolveWithGuess.h"
 #include "src/IterativeLinearSolvers/IterativeSolverBase.h"
 #include "src/IterativeLinearSolvers/BasicPreconditioners.h"
 #include "src/IterativeLinearSolvers/ConjugateGradient.h"
+#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
 #include "src/IterativeLinearSolvers/BiCGSTAB.h"
 #include "src/IterativeLinearSolvers/IncompleteLUT.h"

--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@@ -11,9 +11,9 @@
  * - \ref SparseQR_Module
  * - \ref IterativeLinearSolvers_Module
  *
-  * \code
-  * #include <Eigen/Sparse>
-  * \endcode
+    \code
+    #include <Eigen/Sparse>
+    \endcode
  */

 #include "SparseCore"
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -226,6 +226,11 @@ template<typename _MatrixType, int _UpLo> class LDLT
    #endif

  protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }

    /** \internal
      * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
@@ -309,9 +314,9 @@ template<> struct ldlt_inplace<Lower>
      }
      
      // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
-      // was smaller than the cutoff value. However, soince LDLT is not rank-revealing
-      // we should only make sure we do not introduce INF or NaN values.
-      // LAPACK also uses 0 as the cutoff value.
+      // was smaller than the cutoff value. However, since LDLT is not rank-revealing
+      // we should only make sure that we do not introduce INF or NaN values.
+      // Note that LAPACK also uses 0 as the cutoff value.
      RealScalar realAkk = numext::real(mat.coeffRef(k,k));
      if((rs>0) && (abs(realAkk) > RealScalar(0)))
        A21 /= realAkk;
@@ -424,6 +429,8 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 template<typename MatrixType, int _UpLo>
 LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
 {
+  check_template_parameters();
+  
  eigen_assert(a.rows()==a.cols());
  const Index size = a.rows();

@@ -447,7 +454,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
  */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename NumTraits<typename MatrixType::Scalar>::Real& sigma)
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename LDLT<MatrixType,_UpLo>::RealScalar& sigma)
 {
  typedef typename TranspositionType::StorageIndex IndexType;
  const Index size = w.rows();
@@ -490,9 +497,9 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
  // as motivated by LAPACK's xGELSS:
-  // RealScalar tolerance = numext::maxi(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
+  // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
  // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
-  // diagonal element is not well justified and to numerical issues in some cases.
+  // diagonal element is not well justified and leads to numerical issues in some cases.
  // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
  
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -170,6 +170,12 @@ template<typename _MatrixType, int _UpLo> class LLT
    #endif

  protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
    /** \internal
      * Used to compute and store L
      * The strict upper part is not used and even not initialized.
@@ -377,6 +383,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
 template<typename MatrixType, int _UpLo>
 LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
 {
+  check_template_parameters();
+  
  eigen_assert(a.rows()==a.cols());
  const Index size = a.rows();
  m_matrix.resize(size, size);
--- a/Eigen/src/Cholesky/LLT_MKL.h
+++ b/Eigen/src/Cholesky/LLT_MKL.h
@@ -60,7 +60,7 @@ template<> struct mkl_llt<EIGTYPE> \
    lda = m.outerStride(); \
 \
    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
-    info = (info==0) ? Success : NumericalIssue; \
+    info = (info==0) ? -1 : info>0 ? info-1 : size; \
    return info; \
  } \
 }; \
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -277,6 +277,7 @@ class CholmodBase : public SparseSolverBase<Derived>
      if(!x_cd)
      {
        this->m_info = NumericalIssue;
+        return;
      }
      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
      dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
@@ -298,6 +299,7 @@ class CholmodBase : public SparseSolverBase<Derived>
      if(!x_cs)
      {
        this->m_info = NumericalIssue;
+        return;
      }
      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
      dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
@@ -367,7 +369,7 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
    CholmodSimplicialLLT(const MatrixType& matrix) : Base()
    {
      init();
-      compute(matrix);
+      this->compute(matrix);
    }

    ~CholmodSimplicialLLT() {}
@@ -414,7 +416,7 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
    CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
    {
      init();
-      compute(matrix);
+      this->compute(matrix);
    }

    ~CholmodSimplicialLDLT() {}
@@ -459,7 +461,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
    CholmodSupernodalLLT(const MatrixType& matrix) : Base()
    {
      init();
-      compute(matrix);
+      this->compute(matrix);
    }

    ~CholmodSupernodalLLT() {}
@@ -506,7 +508,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
    CholmodDecomposition(const MatrixType& matrix) : Base()
    {
      init();
-      compute(matrix);
+      this->compute(matrix);
    }

    ~CholmodDecomposition() {}
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -24,6 +24,9 @@ namespace Eigen {
  * API for the %Matrix class provides easy access to linear-algebra
  * operations.
  *
+  * See documentation of class Matrix for detailed information on the template parameters
+  * and storage layout.
+  * 
  * This class can be extended with the help of the plugin mechanism described on the page
  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
  *
@@ -74,7 +77,7 @@ class Array
    {
      return Base::operator=(other);
    }
-    
+
    /** Set all the entries to \a value.
      * \sa DenseBase::setConstant(), DenseBase::fill()
      */
@@ -101,7 +104,7 @@ class Array
      */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
+    EIGEN_STRONG_INLINE Array& operator=(const DenseBase<OtherDerived>& other)
    {
      return Base::_set(other);
    }
@@ -145,6 +148,7 @@ class Array
 #endif

 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
    Array(Array&& other)
      : Base(std::move(other))
    {
@@ -152,6 +156,7 @@ class Array
      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
        Base::_set_noalias(other);
    }
+    EIGEN_DEVICE_FUNC
    Array& operator=(Array&& other)
    {
      other.swap(*this);
@@ -220,43 +225,18 @@ class Array
      m_storage.data()[3] = val3;
    }

-    /** Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
    /** Copy constructor */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const Array& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
+            : Base(other)
+    { }

    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::_resize_to_match(other);
-      *this = other;
-    }
+      : Base(other.derived())
+    { }

    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -83,22 +83,10 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN

 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily
-      * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const
-      * reference to a matrix, not a matrix! It is however guaranteed that the return type of eval() is either
-      * PlainObject or const PlainObject&.
-      */
-    typedef Array<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
-
+    typedef typename Base::PlainObject PlainObject;

    /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
 #endif // not EIGEN_PARSED_BY_DOXYGEN

 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -52,7 +52,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
                       const Scalar
                     >::type ScalarWithConstIfNotLvalue;

-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;

    EIGEN_DEVICE_FUNC
    explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
@@ -149,7 +149,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index,Index)*/
    EIGEN_DEVICE_FUNC
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+    void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }

  protected:
    NestedExpressionType m_expression;
@@ -195,10 +195,10 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
                       const Scalar
                     >::type ScalarWithConstIfNotLvalue;

-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;

    EIGEN_DEVICE_FUNC
-    explicit inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
+    explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}

    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_expression.rows(); }
@@ -288,7 +288,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
    /** Forwards the resizing request to the nested expression
      * \sa DenseBase::resize(Index,Index)*/
    EIGEN_DEVICE_FUNC
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+    void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }

  protected:
    NestedExpressionType m_expression;
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -28,18 +28,22 @@ template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
 struct copy_using_evaluator_traits
 {
  typedef typename DstEvaluator::XprType Dst;
+  typedef typename Dst::Scalar DstScalar;
+  // TODO distinguish between linear traversal and inner-traversals
+  typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type PacketType; 
  
  enum {
    DstFlags = DstEvaluator::Flags,
-    SrcFlags = SrcEvaluator::Flags
+    SrcFlags = SrcEvaluator::Flags,
+    RequiredAlignment = unpacket_traits<PacketType>::alignment
  };
  
 public:
  enum {
-    DstIsAligned = DstFlags & AlignedBit,
+    DstAlignment = DstEvaluator::Alignment,
+    SrcAlignment = SrcEvaluator::Alignment,
    DstHasDirectAccess = DstFlags & DirectAccessBit,
-    SrcIsAligned = SrcFlags & AlignedBit,
-    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
+    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
  };

 private:
@@ -51,7 +55,7 @@ private:
              : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
              : int(Dst::MaxRowsAtCompileTime),
    MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
-    PacketSize = packet_traits<typename Dst::Scalar>::size
+    PacketSize = unpacket_traits<PacketType>::size
  };

  enum {
@@ -62,10 +66,10 @@ private:
                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
                  && (functor_traits<AssignFunc>::PacketAccess),
    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(DstIsAligned) && int(SrcIsAligned),
+                       && int(JointAlignment)>=int(RequiredAlignment),
    MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
+                       && ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
         so it's only good for large enough sizes. */
    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
@@ -107,8 +111,8 @@ public:
                                             : int(NoUnrolling)
                  )
              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) 
-                                                                    : int(NoUnrolling) )
+                ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling)
+                                                                                             : int(NoUnrolling) )
              : int(Traversal) == int(LinearTraversal)
                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
                                              : int(NoUnrolling) )
@@ -124,8 +128,9 @@ public:
    EIGEN_DEBUG_VAR(DstFlags)
    EIGEN_DEBUG_VAR(SrcFlags)
    std::cerr.unsetf(std::ios::hex);
-    EIGEN_DEBUG_VAR(DstIsAligned)
-    EIGEN_DEBUG_VAR(SrcIsAligned)
+    EIGEN_DEBUG_VAR(DstAlignment)
+    EIGEN_DEBUG_VAR(SrcAlignment)
+    EIGEN_DEBUG_VAR(RequiredAlignment)
    EIGEN_DEBUG_VAR(JointAlignment)
    EIGEN_DEBUG_VAR(InnerSize)
    EIGEN_DEBUG_VAR(InnerMaxSize)
@@ -225,6 +230,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;
+  typedef typename Kernel::PacketType PacketType;
  
  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
@@ -234,8 +240,8 @@ struct copy_using_evaluator_innervec_CompleteUnrolling

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
-    kernel.template assignPacketByOuterInner<Aligned, JointAlignment>(outer, inner);
-    enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
+    kernel.template assignPacketByOuterInner<Aligned, JointAlignment, PacketType>(outer, inner);
+    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
  }
 };
@@ -249,10 +255,11 @@ struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
 template<typename Kernel, int Index_, int Stop>
 struct copy_using_evaluator_innervec_InnerUnrolling
 {
+  typedef typename Kernel::PacketType PacketType;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
  {
-    kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, Index_);
-    enum { NextIndex = Index_ + packet_traits<typename Kernel::Scalar>::size };
+    kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, Index_);
+    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
  }
 };
@@ -360,20 +367,23 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index size = kernel.size();
-    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
    enum {
-      packetSize = PacketTraits::size,
-      dstIsAligned = int(Kernel::AssignmentTraits::DstIsAligned),
-      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : dstIsAligned,
+      requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment,
+      packetSize = unpacket_traits<PacketType>::size,
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
+                                                            : int(Kernel::AssignmentTraits::DstAlignment),
      srcAlignment = Kernel::AssignmentTraits::JointAlignment
    };
-    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0), size);
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;

    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);

    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
-      kernel.template assignPacket<dstAlignment, srcAlignment>(index);
+      kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);

    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
  }
@@ -403,14 +413,15 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
 {
+  typedef typename Kernel::PacketType PacketType;
  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
  {
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
-    const Index packetSize = packet_traits<typename Kernel::Scalar>::size;
+    const Index packetSize = unpacket_traits<PacketType>::size;
    for(Index outer = 0; outer < outerSize; ++outer)
      for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, inner);
+        kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, inner);
  }
 };

@@ -471,18 +482,27 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 {
  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
  {
-    typedef packet_traits<typename Kernel::Scalar> PacketTraits;
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
    enum {
-      packetSize = PacketTraits::size,
-      alignable = PacketTraits::AlignedOnScalar,
-      dstAlignment = alignable ? Aligned : int(Kernel::AssignmentTraits::DstIsAligned)
+      packetSize = unpacket_traits<PacketType>::size,
+      requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment),
+      alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = alignable ? int(requestedAlignment)
+                               : int(Kernel::AssignmentTraits::DstAlignment)
    };
+    const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
+    if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
+    {
+      // the pointer is not even aligned on a scalar boundary, so packet alignment is not possible
+      return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
+    }
    const Index packetAlignedMask = packetSize - 1;
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart = ((!alignable) || Kernel::AssignmentTraits::DstIsAligned) ? 0
-                       : internal::first_aligned(&kernel.dstEvaluator().coeffRef(0,0), innerSize);
+    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);

    for(Index outer = 0; outer < outerSize; ++outer)
    {
@@ -493,7 +513,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>

      // do the vectorizable part of the assignment
      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned>(outer, inner);
+        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);

      // do the non-vectorizable part of the assignment
      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
@@ -527,6 +547,7 @@ public:
  typedef typename DstEvaluatorType::Scalar Scalar;
  typedef typename DstEvaluatorType::StorageIndex StorageIndex;
  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
+  typedef typename AssignmentTraits::PacketType PacketType;
  
  
  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
@@ -571,24 +592,24 @@ public:
  }
  
  
-  template<int StoreMode, int LoadMode>
+  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC void assignPacket(Index row, Index col)
  {
-    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode>(row,col));
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
  }
  
-  template<int StoreMode, int LoadMode>
+  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC void assignPacket(Index index)
  {
-    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode>(index));
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
  }
  
-  template<int StoreMode, int LoadMode>
+  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC void assignPacketByOuterInner(Index outer, Index inner)
  {
    Index row = rowIndexByOuterInner(outer, inner); 
    Index col = colIndexByOuterInner(outer, inner);
-    assignPacket<StoreMode,LoadMode>(row, col);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
  }
  
  EIGEN_DEVICE_FUNC static Index rowIndexByOuterInner(Index outer, Index inner)
@@ -626,8 +647,8 @@ EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const S
 {
  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
  
-  typedef typename evaluator<DstXprType>::type DstEvaluatorType;
-  typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;

  DstEvaluatorType dstEvaluator(dst);
  SrcEvaluatorType srcEvaluator(src);
@@ -749,6 +770,26 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
 }

+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+    dst.resize(dstRows, dstCols);
+  
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
+  
+  Assignment<Dst,Src,Func>::run(dst, src, func);
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>());
+}
+
 // forward declaration
 template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);

@@ -776,7 +817,6 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
  {
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-    
    src.evalTo(dst);
  }
 };
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -1,6 +1,7 @@
 /*
 Copyright (c) 2011, Intel Corporation. All rights reserved.
-
+ Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+ 
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:

@@ -37,17 +38,13 @@ namespace Eigen {

 namespace internal {

-template<typename Op> struct vml_call
-{ enum { IsSupported = 0 }; };
-
-template<typename Dst, typename Src, typename UnaryOp>
+template<typename Dst, typename Src>
 class vml_assign_traits
 {
  private:
    enum {
      DstHasDirectAccess = Dst::Flags & DirectAccessBit,
      SrcHasDirectAccess = Src::Flags & DirectAccessBit,
-
      StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
      InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
                : int(Dst::Flags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
@@ -57,165 +54,118 @@ class vml_assign_traits
                    : int(Dst::MaxRowsAtCompileTime),
      MaxSizeAtCompileTime = Dst::SizeAtCompileTime,

-      MightEnableVml =  vml_call<UnaryOp>::IsSupported && StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess
-                     && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
+      MightEnableVml = StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
      MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
      VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
-      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD,
-      MayEnableVml = MightEnableVml && LargeEnough,
-      MayLinearize = MayEnableVml && MightLinearize
+      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD
    };
  public:
    enum {
-      Traversal = MayLinearize ? LinearVectorizedTraversal
-                : MayEnableVml ? InnerVectorizedTraversal
-                : DefaultTraversal
+      EnableVml = MightEnableVml && LargeEnough,
+      Traversal = MightLinearize ? LinearTraversal : DefaultTraversal
    };
 };

-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling,
-         int VmlTraversal = vml_assign_traits<Derived1, Derived2, UnaryOp>::Traversal >
-struct vml_assign_impl
-  : assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>
-{
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, InnerVectorizedTraversal>
-{
-  typedef typename Derived1::Scalar Scalar;
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer) {
-      const Scalar *src_ptr = src.IsRowMajor ?  &(src.nestedExpression().coeffRef(outer,0)) :
-                                                &(src.nestedExpression().coeffRef(0, outer));
-      Scalar *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));
-      vml_call<UnaryOp>::run(src.functor(), innerSize, src_ptr, dst_ptr );
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, LinearVectorizedTraversal>
-{
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    vml_call<UnaryOp>::run(src.functor(), dst.size(), src.nestedExpression().data(), dst.data() );
-  }
-};
-
-// Macroses
-
-#define EIGEN_MKL_VML_SPECIALIZE_ASSIGN(TRAVERSAL,UNROLLING) \
-  template<typename Derived1, typename Derived2, typename UnaryOp> \
-  struct assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>, TRAVERSAL, UNROLLING, Specialized>  {  \
-    static inline void run(Derived1 &dst, const Eigen::CwiseUnaryOp<UnaryOp, Derived2> &src) { \
-      vml_assign_impl<Derived1,Derived2,UnaryOp,TRAVERSAL,UNROLLING>::run(dst, src); \
-    } \
-  };
-
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(SliceVectorizedTraversal,NoUnrolling)
-
-
+#define EIGEN_PP_EXPAND(ARG) ARG
 #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define  EIGEN_MKL_VML_MODE VML_HA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
 #else
-#define  EIGEN_MKL_VML_MODE VML_LA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
 #endif

-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)     \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst);                           \
-    }                                                                            \
+#define EIGEN_VMLMODE_EXPAND__ 
+
+#define EIGEN_VMLMODE_PREFIX_LA vm
+#define EIGEN_VMLMODE_PREFIX__  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested>                                                                         \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,             \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {    \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                             \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
+        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                           \
+      } else {                                                                                                                  \
+        const Index outerSize = dst.outerSize();                                                                                \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                      \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                             \
+                                                      &(src.nestedExpression().coeffRef(0, outer));                             \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                           \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr,                                                                      \
+                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                             \
+        }                                                                                                                       \
+      }                                                                                                                         \
+    }                                                                                                                           \
+  };                                                                                                                            \
+
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),s##VMLOP), float, float, VMLMODE)           \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),d##VMLOP), double, double, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),c##VMLOP), scomplex, MKL_Complex8, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),z##VMLOP), dcomplex, MKL_Complex16, VMLMODE)
+  
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP, VMLMODE)                                                              \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                               \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)
+
+  
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sin,   Sin,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(asin,  Asin,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sinh,  Sinh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cos,   Cos,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(acos,  Acos,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cosh,  Cosh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tan,   Tan,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(atan,  Atan,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tanh,  Tanh,  LA)
+// EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,   Abs,    _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(exp,   Exp,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log,   Ln,    LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log10, Log10, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt,  Sqrt,  _)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr,   _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(arg, Arg,      _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
+
+#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested>                                                                       \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,           \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {  \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                          \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                           \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.functor().m_exponent);                                          \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
+      {                                                                                                                       \
+        VMLOP( dst.size(), (const VMLTYPE*)src.nestedExpression().data(), exponent,                                           \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
+      } else {                                                                                                                \
+        const Index outerSize = dst.outerSize();                                                                              \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                           \
+                                                      &(src.nestedExpression().coeffRef(0, outer));                           \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
+        }                                                                                                                     \
+      }                                                                                                                       \
+    }                                                                                                                         \
  };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)  \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst, vmlMode);                  \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)       \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& func,        \
-                          int size, const EIGENTYPE* src, EIGENTYPE* dst) {      \
-      EIGENTYPE exponent = func.m_exponent;                                      \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(&size, (const VMLTYPE*)src, (const VMLTYPE*)&exponent,               \
-                        (VMLTYPE*)dst, &vmlMode);                                \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vs##VMLOP, float, float)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vc##VMLOP, scomplex, MKL_Complex8)   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP)                        \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)
-
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vms##VMLOP, float, float)         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmc##VMLOP, scomplex, MKL_Complex8)  \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(EIGENOP, VMLOP)                     \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                      \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)
-
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sin,  Sin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos,  Cos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan,  Tan)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan,  Atan)
-//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,  Abs)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp,  Exp)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log,  Ln)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sqrt, Sqrt)
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr)
-
-// The vm*powx functions are not avaibale in the windows version of MKL.
-#ifndef _WIN32
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmspowx_, float, float)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdpowx_, double, double)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcpowx_, scomplex, MKL_Complex8)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzpowx_, dcomplex, MKL_Complex16)
-#endif
+  
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmsPowx, float,    float,         LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdPowx, double,   double,        LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcPowx, scomplex, MKL_Complex8,  LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzPowx, dcomplex, MKL_Complex16, LA)

 } // end namespace internal

--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -55,7 +55,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
  typedef typename traits<XprType>::Scalar Scalar;
  typedef typename traits<XprType>::StorageKind StorageKind;
  typedef typename traits<XprType>::XprKind XprKind;
-  typedef typename nested<XprType>::type XprTypeNested;
+  typedef typename ref_selector<XprType>::type XprTypeNested;
  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
  enum{
    MatrixRows = traits<XprType>::RowsAtCompileTime,
@@ -81,14 +81,16 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
                             ? int(outer_stride_at_compile_time<XprType>::ret)
                             : int(inner_stride_at_compile_time<XprType>::ret),
-    // IsAligned is needed by MapBase's assertions
-    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
-    IsAligned = 0,
+
    // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
-    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit
+    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
    // FIXME DirectAccessBit should not be handled by expressions
+    // 
+    // Alignment is needed by MapBase's assertions
+    // We can safely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
+    Alignment = 0
  };
 };

@@ -124,26 +126,26 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
    /** Fixed-size constructor
      */
    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
-      : Impl(xpr, a_startRow, a_startCol)
+    inline Block(XprType& xpr, Index startRow, Index startCol)
+      : Impl(xpr, startRow, startCol)
    {
      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
-      eigen_assert(a_startRow >= 0 && BlockRows >= 1 && a_startRow + BlockRows <= xpr.rows()
-             && a_startCol >= 0 && BlockCols >= 1 && a_startCol + BlockCols <= xpr.cols());
+      eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows()
+             && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols());
    }

    /** Dynamic-size constructor
      */
    EIGEN_DEVICE_FUNC
    inline Block(XprType& xpr,
-          Index a_startRow, Index a_startCol,
+          Index startRow, Index startCol,
          Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols)
    {
      eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
-      eigen_assert(a_startRow >= 0 && blockRows >= 0 && a_startRow  <= xpr.rows() - blockRows
-          && a_startCol >= 0 && blockCols >= 0 && a_startCol <= xpr.cols() - blockCols);
+      eigen_assert(startRow >= 0 && blockRows >= 0 && startRow  <= xpr.rows() - blockRows
+          && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols);
    }
 };
         
@@ -159,10 +161,10 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
    typedef Impl Base;
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
    EIGEN_DEVICE_FUNC
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
+    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };

 namespace internal {
@@ -198,8 +200,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
    /** Fixed-size constructor
      */
    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
                    m_blockRows(BlockRows), m_blockCols(BlockCols)
    {}

@@ -207,9 +209,9 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
      */
    EIGEN_DEVICE_FUNC
    inline BlockImpl_dense(XprType& xpr,
-          Index a_startRow, Index a_startCol,
+          Index startRow, Index startCol,
          Index blockRows, Index blockCols)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
                    m_blockRows(blockRows), m_blockCols(blockCols)
    {}

--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -80,7 +80,7 @@ struct any_unroller<Derived, Dynamic>
 template<typename Derived>
 inline bool DenseBase<Derived>::all() const
 {
-  typedef typename internal::evaluator<Derived>::type Evaluator;
+  typedef internal::evaluator<Derived> Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
          && Evaluator::CoeffReadCost != Dynamic
@@ -106,7 +106,7 @@ inline bool DenseBase<Derived>::all() const
 template<typename Derived>
 inline bool DenseBase<Derived>::any() const
 {
-  typedef typename internal::evaluator<Derived>::type Evaluator;
+  typedef internal::evaluator<Derived> Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
          && Evaluator::CoeffReadCost != Dynamic
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -105,6 +105,9 @@ struct CommaInitializer

  EIGEN_DEVICE_FUNC
  inline ~CommaInitializer()
+#if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
+  throw(Eigen::eigen_assert_exception)
+#endif
  {
    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
         && m_col == m_xpr.cols()
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -28,13 +28,9 @@ struct storage_kind_to_evaluator_kind {
 // It can be Dense, Sparse, Triangular, Diagonal, SelfAdjoint, Band, etc.
 template<typename StorageKind> struct storage_kind_to_shape;

-
-template<> struct storage_kind_to_shape<Dense> { typedef DenseShape Shape; };
-
-
-// FIXME Is this necessary? And why was it not before refactoring???
-template<> struct storage_kind_to_shape<PermutationStorage> { typedef PermutationShape Shape; };
-
+template<> struct storage_kind_to_shape<Dense>                  { typedef DenseShape Shape;           };
+template<> struct storage_kind_to_shape<PermutationStorage>     { typedef PermutationShape Shape;     };
+template<> struct storage_kind_to_shape<TranspositionsStorage>  { typedef TranspositionsShape Shape;  };

 // Evaluators have to be specialized with respect to various criteria such as:
 //  - storage/structure/shape
@@ -63,11 +59,6 @@ template< typename T,
 template<typename T>
 struct evaluator_traits_base
 {
-  // TODO check whether these two indirections are really needed.
-  // Basically, if nobody overwrite type and nestedType, then, they can be dropped
-//   typedef evaluator<T> type;
-//   typedef evaluator<T> nestedType;
-  
  // by default, get evaluator kind and shape from storage
  typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
  typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
@@ -94,27 +85,28 @@ struct evaluator : public unary_evaluator<T>


 // TODO: Think about const-correctness
-
 template<typename T>
 struct evaluator<const T>
  : evaluator<T>
-{ };
-
-// ---------- base class for all writable evaluators ----------
-
-// TODO this class does not seem to be necessary anymore
-template<typename ExpressionType>
-struct evaluator_base
 {
-//   typedef typename evaluator_traits<ExpressionType>::type type;
-//   typedef typename evaluator_traits<ExpressionType>::nestedType nestedType;
-  typedef evaluator<ExpressionType> type;
-  typedef evaluator<ExpressionType> nestedType;
-  
+  EIGEN_DEVICE_FUNC
+  explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
+};
+
+// ---------- base class for all evaluators ----------
+
+template<typename ExpressionType>
+struct evaluator_base : public noncopyable
+{
  // FIXME is it really usefull?
  typedef typename traits<ExpressionType>::StorageIndex StorageIndex;
+  
  // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
  typedef traits<ExpressionType> ExpressionTraits;
+  
+  enum {
+    Alignment = 0
+  };
 };

 // -------------------- Matrix and Array --------------------
@@ -131,8 +123,6 @@ struct evaluator<PlainObjectBase<Derived> >
  typedef PlainObjectBase<Derived> PlainObjectType;
  typedef typename PlainObjectType::Scalar Scalar;
  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
-  typedef typename PlainObjectType::PacketScalar PacketScalar;
-  typedef typename PlainObjectType::PacketReturnType PacketReturnType;

  enum {
    IsRowMajor = PlainObjectType::IsRowMajor,
@@ -141,8 +131,8 @@ struct evaluator<PlainObjectBase<Derived> >
    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
    
    CoeffReadCost = NumTraits<Scalar>::ReadCost,
-    Flags = compute_matrix_evaluator_flags< Scalar,Derived::RowsAtCompileTime,Derived::ColsAtCompileTime,
-                                            Derived::Options,Derived::MaxRowsAtCompileTime,Derived::MaxColsAtCompileTime>::ret
+    Flags = traits<Derived>::EvaluatorFlags,
+    Alignment = traits<Derived>::Alignment
  };
  
  EIGEN_DEVICE_FUNC evaluator()
@@ -182,36 +172,36 @@ struct evaluator<PlainObjectBase<Derived> >
    return const_cast<Scalar*>(m_data)[index];
  }

-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
  {
    if (IsRowMajor)
-      return ploadt<PacketScalar, LoadMode>(m_data + row * m_outerStride.value() + col);
+      return ploadt<PacketType, LoadMode>(m_data + row * m_outerStride.value() + col);
    else
-      return ploadt<PacketScalar, LoadMode>(m_data + row + col * m_outerStride.value());
+      return ploadt<PacketType, LoadMode>(m_data + row + col * m_outerStride.value());
  }

-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
  {
-    return ploadt<PacketScalar, LoadMode>(m_data + index);
+    return ploadt<PacketType, LoadMode>(m_data + index);
  }

-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x)
+  template<int StoreMode,typename PacketType>
+  void writePacket(Index row, Index col, const PacketType& x)
  {
    if (IsRowMajor)
-      return pstoret<Scalar, PacketScalar, StoreMode>
+      return pstoret<Scalar, PacketType, StoreMode>
 	            (const_cast<Scalar*>(m_data) + row * m_outerStride.value() + col, x);
    else
-      return pstoret<Scalar, PacketScalar, StoreMode>
+      return pstoret<Scalar, PacketType, StoreMode>
                    (const_cast<Scalar*>(m_data) + row + col * m_outerStride.value(), x);
  }

-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x)
+  template<int StoreMode, typename PacketType>
+  void writePacket(Index index, const PacketType& x)
  {
-    return pstoret<Scalar, PacketScalar, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
+    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
  }

 protected:
@@ -229,7 +219,7 @@ struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
 {
  typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
  
-  evaluator() {}
+  EIGEN_DEVICE_FUNC evaluator() {}

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
    : evaluator<PlainObjectBase<XprType> >(m) 
@@ -242,7 +232,7 @@ struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
 {
  typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;

-  evaluator() {}
+  EIGEN_DEVICE_FUNC evaluator() {}
  
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
    : evaluator<PlainObjectBase<XprType> >(m) 
@@ -259,15 +249,14 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
  
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,    
-    Flags = evaluator<ArgType>::Flags ^ RowMajorBit
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
+    Alignment = evaluator<ArgType>::Alignment
  };

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
  {
@@ -289,32 +278,32 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
    return m_argImpl.coeffRef(index);
  }

-  template<int LoadMode>
-  PacketReturnType packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
  {
-    return m_argImpl.template packet<LoadMode>(col, row);
+    return m_argImpl.template packet<LoadMode,PacketType>(col, row);
  }

-  template<int LoadMode>
-  PacketReturnType packet(Index index) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
  {
-    return m_argImpl.template packet<LoadMode>(index);
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
  }

-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x)
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index row, Index col, const PacketType& x)
  {
-    m_argImpl.template writePacket<StoreMode>(col, row, x);
+    m_argImpl.template writePacket<StoreMode,PacketType>(col, row, x);
  }

-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x)
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index index, const PacketType& x)
  {
-    m_argImpl.template writePacket<StoreMode>(index, x);
+    m_argImpl.template writePacket<StoreMode,PacketType>(index, x);
  }

 protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
+  evaluator<ArgType> m_argImpl;
 };

 // -------------------- CwiseNullaryOp --------------------
@@ -335,7 +324,8 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
          &  (  HereditaryBits
              | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
              | (functor_traits<NullaryOp>::PacketAccess    ? PacketAccessBit : 0)))
-          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit) // FIXME EvalBeforeNestingBit should be needed anymore
+          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit), // FIXME EvalBeforeNestingBit should not be needed anymore
+    Alignment = 0 // FIXME alignment should not matter here, perhaps we could set it to AlignMax??
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
@@ -343,7 +333,6 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
  { }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
  {
@@ -355,16 +344,16 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
    return m_functor(index);
  }

-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
  {
-    return m_functor.packetOp(row, col);
+    return m_functor.template packetOp<Index,PacketType>(row, col);
  }

-  template<int LoadMode>
-  PacketScalar packet(Index index) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
  {
-    return m_functor.packetOp(index);
+    return m_functor.template packetOp<Index,PacketType>(index);
  }

 protected:
@@ -382,9 +371,9 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
    
-    Flags = evaluator<ArgType>::Flags & (
-              HereditaryBits | LinearAccessBit | AlignedBit
-            | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0))
+    Flags = evaluator<ArgType>::Flags
+          & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    Alignment = evaluator<ArgType>::Alignment
  };

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
@@ -393,7 +382,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
  { }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
  {
@@ -405,21 +393,21 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
    return m_functor(m_argImpl.coeff(index));
  }

-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
  {
-    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(row, col));
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(row, col));
  }

-  template<int LoadMode>
-  PacketScalar packet(Index index) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
  {
-    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index));
  }

 protected:
  const UnaryOp m_functor;
-  typename evaluator<ArgType>::nestedType m_argImpl;
+  evaluator<ArgType> m_argImpl;
 };

 // -------------------- CwiseBinaryOp --------------------
@@ -451,13 +439,13 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
        HereditaryBits
      | (int(LhsFlags) & int(RhsFlags) &
-           ( AlignedBit
-           | (StorageOrdersAgree ? LinearAccessBit : 0)
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
           )
        )
     ),
-    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit)
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
  };

  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
@@ -467,7 +455,6 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
  { }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
  {
@@ -479,24 +466,24 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
    return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
  }

-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
  {
-    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode>(row, col),
-                              m_rhsImpl.template packet<LoadMode>(row, col));
+    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col),
+                              m_rhsImpl.template packet<LoadMode,PacketType>(row, col));
  }

-  template<int LoadMode>
-  PacketScalar packet(Index index) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
  {
-    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode>(index),
-                              m_rhsImpl.template packet<LoadMode>(index));
+    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index),
+                              m_rhsImpl.template packet<LoadMode,PacketType>(index));
  }

 protected:
  const BinaryOp m_functor;
-  typename evaluator<Lhs>::nestedType m_lhsImpl;
-  typename evaluator<Rhs>::nestedType m_rhsImpl;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
 };

 // -------------------- CwiseUnaryView --------------------
@@ -510,7 +497,9 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
    
-    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit))
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
+    
+    Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
  };

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
@@ -543,7 +532,7 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>

 protected:
  const UnaryOp m_unaryOp;
-  typename evaluator<ArgType>::nestedType m_argImpl;
+  evaluator<ArgType> m_argImpl;
 };

 // -------------------- Map --------------------
@@ -560,8 +549,6 @@ struct mapbase_evaluator : evaluator_base<Derived>
  typedef typename XprType::PointerType PointerType;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
  
  enum {
    IsRowMajor = XprType::RowsAtCompileTime,
@@ -597,30 +584,30 @@ struct mapbase_evaluator : evaluator_base<Derived>
    return m_data[index * m_xpr.innerStride()];
  }
 
-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const 
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index row, Index col) const 
  {
    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
-    return internal::ploadt<PacketScalar, LoadMode>(ptr);
+    return internal::ploadt<PacketType, LoadMode>(ptr);
  }

-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const 
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index index) const 
  {
-    return internal::ploadt<PacketScalar, LoadMode>(m_data + index * m_xpr.innerStride());
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
  }
  
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x) 
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index row, Index col, const PacketType& x) 
  {
    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
-    return internal::pstoret<Scalar, PacketScalar, StoreMode>(ptr, x);
+    return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
  }
  
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x) 
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index index, const PacketType& x) 
  {
-    internal::pstoret<Scalar, PacketScalar, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
  }
 
 protected:
@@ -634,6 +621,8 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
 {
  typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
  
  enum {
    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
@@ -645,18 +634,21 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
    HasNoInnerStride = InnerStrideAtCompileTime == 1,
    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
    HasNoStride = HasNoInnerStride && HasNoOuterStride,
-    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+    
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    
    KeepsPacketAccess = bool(HasNoInnerStride)
                        && ( bool(IsDynamicSize)
                           || HasNoOuterStride
                           || ( OuterStrideAtCompileTime!=Dynamic
-                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
+                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime) % PacketAlignment)==0 ) ),
    Flags0 = evaluator<PlainObjectType>::Flags,
-    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
-    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
-           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
-    Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit)
+    Flags1 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
+           ? int(Flags0) : int(Flags0 & ~LinearAccessBit),
+    Flags = KeepsPacketAccess ? int(Flags1) : (int(Flags1) & ~PacketAccessBit),
+    
+    Alignment = int(MapOptions)&int(AlignedMask)
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
@@ -673,7 +665,8 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
  
  enum {
-    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
+    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
@@ -691,7 +684,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
  : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
 {
  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
-  typedef typename XprType::Scalar Scalar; 
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
  
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
@@ -717,14 +712,16 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
                       && (InnerStrideAtCompileTime == 1)
                        ? PacketAccessBit : 0,
    
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
    Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
                                           DirectAccessBit |
-                                           MaskPacketAccessBit |
-                                           MaskAlignedBit),
-    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit
+                                           MaskPacketAccessBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
+    
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
  };
  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) {}
@@ -756,8 +753,6 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
 
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;

  enum {
    RowsAtCompileTime = XprType::RowsAtCompileTime
@@ -783,35 +778,35 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
  }
 
-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const 
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index row, Index col) const 
  { 
-    return m_argImpl.template packet<LoadMode>(m_startRow.value() + row, m_startCol.value() + col); 
+    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col); 
  }

-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const 
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index index) const 
  { 
-    return packet<LoadMode>(RowsAtCompileTime == 1 ? 0 : index,
-                            RowsAtCompileTime == 1 ? index : 0);
+    return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                       RowsAtCompileTime == 1 ? index : 0);
  }
  
-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x) 
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index row, Index col, const PacketType& x) 
  { 
-    return m_argImpl.template writePacket<StoreMode>(m_startRow.value() + row, m_startCol.value() + col, x); 
+    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x); 
  }
  
-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x) 
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index index, const PacketType& x) 
  { 
-    return writePacket<StoreMode>(RowsAtCompileTime == 1 ? 0 : index,
-                                  RowsAtCompileTime == 1 ? index : 0,
-                                  x);
+    return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                             RowsAtCompileTime == 1 ? index : 0,
+                                             x);
  }
 
 protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
+  evaluator<ArgType> m_argImpl;
  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
 };
@@ -825,12 +820,13 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
                      typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
 {
  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;

  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
  {
    // FIXME this should be an internal assertion
-    eigen_assert(EIGEN_IMPLIES(evaluator<XprType>::Flags&AlignedBit, (size_t(block.data()) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
+    eigen_assert(((size_t(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
  }
 };

@@ -849,7 +845,9 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
                  + EIGEN_SIZE_MAX(evaluator<ThenMatrixType>::CoeffReadCost,
                                   evaluator<ElseMatrixType>::CoeffReadCost),

-    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits
+    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
+    
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
  };

  inline EIGEN_DEVICE_FUNC  explicit evaluator(const XprType& select)
@@ -877,9 +875,9 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
  }
 
 protected:
-  typename evaluator<ConditionMatrixType>::nestedType m_conditionImpl;
-  typename evaluator<ThenMatrixType>::nestedType m_thenImpl;
-  typename evaluator<ElseMatrixType>::nestedType m_elseImpl;
+  evaluator<ConditionMatrixType> m_conditionImpl;
+  evaluator<ThenMatrixType> m_thenImpl;
+  evaluator<ElseMatrixType> m_elseImpl;
 };


@@ -891,7 +889,6 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
 {
  typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
  enum {
    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
  };
@@ -901,7 +898,9 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
  enum {
    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
    
-    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & HereditaryBits & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit)
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & HereditaryBits & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit),
+    
+    Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
  };

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
@@ -923,9 +922,19 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
    
    return m_argImpl.coeff(actual_row, actual_col);
  }
+  
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+    
+    return m_argImpl.coeff(actual_index);
+  }

-  template<int LoadMode>
-  PacketReturnType packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
  {
    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
                           : RowFactor==1 ? row
@@ -934,12 +943,22 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
                           : ColFactor==1 ? col
                           : col % m_cols.value();

-    return m_argImpl.template packet<LoadMode>(actual_row, actual_col);
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_row, actual_col);
+  }
+  
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  {
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_index);
  }
 
 protected:
  const ArgTypeNested m_arg; // FIXME is it OK to store both the argument and its evaluator?? (we have the same situation in evaluator_product)
-  typename evaluator<ArgTypeNestedCleaned>::nestedType m_argImpl;
+  evaluator<ArgTypeNestedCleaned> m_argImpl;
  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime> m_rows;
  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
 };
@@ -965,7 +984,9 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
    
-    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&HereditaryBits)
+    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&HereditaryBits),
+    
+    Alignment = 0 // FIXME this could be improved
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType expr)
@@ -1001,15 +1022,14 @@ struct evaluator_wrapper_base
  typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    Flags = evaluator<ArgType>::Flags
+    Flags = evaluator<ArgType>::Flags,
+    Alignment = evaluator<ArgType>::Alignment
  };

  EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}

  typedef typename ArgType::Scalar Scalar;
  typedef typename ArgType::CoeffReturnType CoeffReturnType;
-  typedef typename ArgType::PacketScalar PacketScalar;
-  typedef typename ArgType::PacketReturnType PacketReturnType;

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
  {
@@ -1031,32 +1051,32 @@ struct evaluator_wrapper_base
    return m_argImpl.coeffRef(index);
  }

-  template<int LoadMode> 
-  PacketReturnType packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index row, Index col) const
  {
-    return m_argImpl.template packet<LoadMode>(row, col);
+    return m_argImpl.template packet<LoadMode,PacketType>(row, col);
  }

-  template<int LoadMode> 
-  PacketReturnType packet(Index index) const
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index index) const
  {
-    return m_argImpl.template packet<LoadMode>(index);
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
  }

-  template<int StoreMode> 
-  void writePacket(Index row, Index col, const PacketScalar& x)
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index row, Index col, const PacketType& x)
  {
    m_argImpl.template writePacket<StoreMode>(row, col, x);
  }

-  template<int StoreMode> 
-  void writePacket(Index index, const PacketScalar& x)
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index index, const PacketType& x)
  {
    m_argImpl.template writePacket<StoreMode>(index, x);
  }

 protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
+  evaluator<ArgType> m_argImpl;
 };

 template<typename TArgType>
@@ -1085,7 +1105,7 @@ struct unary_evaluator<ArrayWrapper<TArgType> >
 // -------------------- Reverse --------------------

 // defined in Reverse.h:
-template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond;
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond;

 template<typename ArgType, int Direction>
 struct unary_evaluator<Reverse<ArgType, Direction> >
@@ -1094,17 +1114,12 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
  typedef Reverse<ArgType, Direction> XprType;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;

  enum {
-    PacketSize = internal::packet_traits<Scalar>::size,
    IsRowMajor = XprType::IsRowMajor,
    IsColMajor = !IsRowMajor,
    ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
    ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
-    OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
-    OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1,
    ReversePacket = (Direction == BothDirections)
                    || ((Direction == Vertical)   && IsColMajor)
                    || ((Direction == Horizontal) && IsRowMajor),
@@ -1117,9 +1132,10 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
                 ? LinearAccessBit : 0,

-    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess)
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
+    
+    Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
  };
-  typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
    : m_argImpl(reverse.nestedExpression()),
@@ -1149,38 +1165,53 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
    return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
  }

-  template<int LoadMode>
-  PacketScalar packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
  {
-    return reverse_packet::run(m_argImpl.template packet<LoadMode>(
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    return reverse_packet::run(m_argImpl.template packet<LoadMode,PacketType>(
                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
                                  ReverseCol ? m_cols.value() - col - OffsetCol : col));
  }

-  template<int LoadMode>
-  PacketScalar packet(Index index) const
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
  {
-    return preverse(m_argImpl.template packet<LoadMode>(m_rows.value() * m_cols.value() - index - PacketSize));
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    return preverse(m_argImpl.template packet<LoadMode,PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
  }

-  template<int LoadMode>
-  void writePacket(Index row, Index col, const PacketScalar& x)
+  template<int LoadMode, typename PacketType>
+  void writePacket(Index row, Index col, const PacketType& x)
  {
+    // FIXME we could factorize some code with packet(i,j)
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
    m_argImpl.template writePacket<LoadMode>(
                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
                                  ReverseCol ? m_cols.value() - col - OffsetCol : col,
                                  reverse_packet::run(x));
  }

-  template<int LoadMode>
-  void writePacket(Index index, const PacketScalar& x)
+  template<int LoadMode, typename PacketType>
+  void writePacket(Index index, const PacketType& x)
  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
    m_argImpl.template writePacket<LoadMode>
      (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
  }
 
 protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
+  evaluator<ArgType> m_argImpl;

  // If we do not reverse rows, then we do not need to know the number of rows; same for columns
  const variable_if_dynamic<Index, ReverseRow ? ArgType::RowsAtCompileTime : 0> m_rows;
@@ -1199,7 +1230,9 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
    
-    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit
+    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit,
+    
+    Alignment = 0
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
@@ -1233,7 +1266,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
  }

 protected:
-  typename evaluator<ArgType>::nestedType m_argImpl;
+  evaluator<ArgType> m_argImpl;
  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;

 private:
@@ -1291,15 +1324,12 @@ class EvalToTemp
 
 template<typename ArgType>
 struct evaluator<EvalToTemp<ArgType> >
-  : public evaluator<typename ArgType::PlainObject>::type
+  : public evaluator<typename ArgType::PlainObject>
 {
  typedef EvalToTemp<ArgType>                   XprType;
  typedef typename ArgType::PlainObject         PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
+  typedef evaluator<PlainObject> Base;
  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
    : m_result(xpr.rows(), xpr.cols())
  {
--- a/Eigen/src/Core/CoreIterators.h
+++ b/Eigen/src/Core/CoreIterators.h
@@ -34,7 +34,7 @@ class InnerIterator
 {
 protected:
  typedef internal::inner_iterator_selector<XprType, typename internal::evaluator_traits<XprType>::Kind> IteratorType;
-  typedef typename internal::evaluator<XprType>::type EvaluatorType;
+  typedef internal::evaluator<XprType> EvaluatorType;
  typedef typename internal::traits<XprType>::Scalar Scalar;
 public:
  /** Construct an iterator over the \a outerId -th row or column of \a xpr */
@@ -74,7 +74,7 @@ template<typename XprType>
 class inner_iterator_selector<XprType, IndexBased>
 {
 protected:
-  typedef typename evaluator<XprType>::type EvaluatorType;
+  typedef evaluator<XprType> EvaluatorType;
  typedef typename traits<XprType>::Scalar Scalar;
  enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
  
@@ -112,7 +112,7 @@ class inner_iterator_selector<XprType, IteratorBased>
 {
 protected:
  typedef typename evaluator<XprType>::InnerIterator Base;
-  typedef typename evaluator<XprType>::type EvaluatorType;
+  typedef evaluator<XprType> EvaluatorType;
  
 public:
  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &/*innerSize*/)
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -95,8 +95,8 @@ class CwiseBinaryOp :
                                                      BinaryOp>::ret>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)

-    typedef typename internal::nested<LhsType>::type LhsNested;
-    typedef typename internal::nested<RhsType>::type RhsNested;
+    typedef typename internal::ref_selector<LhsType>::type LhsNested;
+    typedef typename internal::ref_selector<RhsType>::type RhsNested;
    typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
    typedef typename internal::remove_reference<RhsNested>::type _RhsNested;

--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -49,13 +49,13 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
    EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)

    EIGEN_DEVICE_FUNC
-    CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
-      : m_rows(nbRows), m_cols(nbCols), m_functor(func)
+    CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
+      : m_rows(rows), m_cols(cols), m_functor(func)
    {
-      eigen_assert(nbRows >= 0
-            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-            &&  nbCols >= 0
-            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
+      eigen_assert(rows >= 0
+            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+            &&  cols >= 0
+            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
    }

    EIGEN_DEVICE_FUNC
@@ -113,10 +113,10 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
 }

 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -139,12 +139,12 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, Derived>(1, size, func);
-  else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
+  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, PlainObject>(1, size, func);
+  else return CwiseNullaryOp<CustomNullaryOp, PlainObject>(size, 1, func);
 }

 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -158,19 +158,19 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(RowsAtCompileTime, ColsAtCompileTime, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
 }

 /** \returns an expression of a constant matrix of value \a value
  *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
  * the returned matrix. Must be compatible with this DenseBase type.
  *
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a nbRows and \a nbCols as arguments, so Zero() should be used
+  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
  * instead.
  *
  * The template parameter \a CustomNullaryOp is the type of the functor.
@@ -179,9 +179,9 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Constant(Index nbRows, Index nbCols, const Scalar& value)
+DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_constant_op<Scalar>(value));
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
 }

 /** \returns an expression of a constant matrix of value \a value
@@ -245,7 +245,7 @@ EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturn
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,false>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,size));
 }

 /**
@@ -258,7 +258,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,false>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,Derived::SizeAtCompileTime));
 }

 /**
@@ -279,7 +279,7 @@ EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedRetu
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,true>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar,true>(low,high,size));
 }

 /**
@@ -292,7 +292,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,true>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar,true>(low,high,Derived::SizeAtCompileTime));
 }

 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
@@ -300,9 +300,10 @@ template<typename Derived>
 bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
  for(Index j = 0; j < cols(); ++j)
    for(Index i = 0; i < rows(); ++i)
-      if(!internal::isApprox(this->coeff(i, j), val, prec))
+      if(!internal::isApprox(self.coeff(i, j), val, prec))
        return false;
  return true;
 }
@@ -356,8 +357,8 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)

 /** Resizes to the given size, and sets all coefficients in this expression to the given \a value.
  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
  * \param val the value to which all coefficients are set
  *
  * Example: \include Matrix_setConstant_int_int.cpp
@@ -367,9 +368,9 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setConstant(Index nbRows, Index nbCols, const Scalar& val)
+PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
  return setConstant(val);
 }

@@ -390,7 +391,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,false>(low,high,newSize));
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,newSize));
 }

 /**
@@ -428,9 +429,9 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Zero(Index nbRows, Index nbCols)
+DenseBase<Derived>::Zero(Index rows, Index cols)
 {
-  return Constant(nbRows, nbCols, Scalar(0));
+  return Constant(rows, cols, Scalar(0));
 }

 /** \returns an expression of a zero vector.
@@ -484,9 +485,10 @@ DenseBase<Derived>::Zero()
 template<typename Derived>
 bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
  for(Index j = 0; j < cols(); ++j)
    for(Index i = 0; i < rows(); ++i)
-      if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec))
+      if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec))
        return false;
  return true;
 }
@@ -523,8 +525,8 @@ PlainObjectBase<Derived>::setZero(Index newSize)

 /** Resizes to the given size, and sets all coefficients in this expression to zero.
  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
  *
  * Example: \include Matrix_setZero_int_int.cpp
  * Output: \verbinclude Matrix_setZero_int_int.out
@@ -533,9 +535,9 @@ PlainObjectBase<Derived>::setZero(Index newSize)
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
  return setConstant(Scalar(0));
 }

@@ -543,7 +545,7 @@ PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)

 /** \returns an expression of a matrix where all coefficients equal one.
  *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
  * the returned matrix. Must be compatible with this MatrixBase type.
  *
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -557,9 +559,9 @@ PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Ones(Index nbRows, Index nbCols)
+DenseBase<Derived>::Ones(Index rows, Index cols)
 {
-  return Constant(nbRows, nbCols, Scalar(1));
+  return Constant(rows, cols, Scalar(1));
 }

 /** \returns an expression of a vector where all coefficients equal one.
@@ -649,8 +651,8 @@ PlainObjectBase<Derived>::setOnes(Index newSize)

 /** Resizes to the given size, and sets all coefficients in this expression to one.
  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
  *
  * Example: \include Matrix_setOnes_int_int.cpp
  * Output: \verbinclude Matrix_setOnes_int_int.out
@@ -659,9 +661,9 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
  return setConstant(Scalar(1));
 }

@@ -669,7 +671,7 @@ PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)

 /** \returns an expression of the identity matrix (not necessarily square).
  *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
  * the returned matrix. Must be compatible with this MatrixBase type.
  *
  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -683,9 +685,9 @@ PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
-MatrixBase<Derived>::Identity(Index nbRows, Index nbCols)
+MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_identity_op<Scalar>());
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
 }

 /** \returns an expression of the identity matrix (not necessarily square).
@@ -719,18 +721,19 @@ template<typename Derived>
 bool MatrixBase<Derived>::isIdentity
 (const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
  for(Index j = 0; j < cols(); ++j)
  {
    for(Index i = 0; i < rows(); ++i)
    {
      if(i == j)
      {
-        if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec))
+        if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec))
          return false;
      }
      else
      {
-        if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec))
+        if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec))
          return false;
      }
    }
@@ -780,8 +783,8 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()

 /** \brief Resizes to the given size, and writes the identity expression (not necessarily square) into *this.
  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
  *
  * Example: \include Matrix_setIdentity_int_int.cpp
  * Output: \verbinclude Matrix_setIdentity_int_int.out
@@ -789,9 +792,9 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index nbRows, Index nbCols)
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
-  derived().resize(nbRows, nbCols);
+  derived().resize(rows, cols);
  return setIdentity();
 }

--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -84,8 +84,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
    nestedExpression() { return m_matrix.const_cast_derived(); }

  protected:
-    // FIXME changed from MatrixType::Nested because of a weird compilation error with sun CC
-    typename internal::nested<MatrixType>::type m_matrix;
+    typename internal::ref_selector<MatrixType>::type m_matrix;
    ViewOp m_functor;
 };

--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -49,6 +49,8 @@ template<typename Derived> class DenseBase
  public:
    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
+    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
+                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator/;


    /** Inner iterator type to iterate over the coefficients of a row or column.
@@ -66,8 +68,14 @@ template<typename Derived> class DenseBase
     */
    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;

+    /** The numeric type of the expression's coefficients, e.g. float, double, int or std::complex<float>, etc. */
    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    
+    /** The numeric type of the expression's coefficients, e.g. float, double, int or std::complex<float>, etc.
+      *
+      * It is an alias for the Scalar type */
+    typedef Scalar value_type;
+    
    typedef typename NumTraits<Scalar>::Real RealScalar;

    typedef internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
@@ -169,8 +177,39 @@ template<typename Derived> class DenseBase
      InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
      OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
    };
+    
+    typedef typename internal::find_best_packet<Scalar,SizeAtCompileTime>::type PacketScalar;

    enum { IsPlainObjectBase = 0 };
+    
+    /** The plain matrix type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Matrix<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainMatrix;
+    
+    /** The plain array type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Array<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainArray;
+
+    /** \brief The plain matrix or array type corresponding to this expression.
+      *
+      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
+      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
+      * that the return type of eval() is either PlainObject or const PlainObject&.
+      */
+    typedef typename internal::conditional<internal::is_same<typename internal::traits<Derived>::XprKind,MatrixXpr >::value,
+                                 PlainMatrix, PlainArray>::type PlainObject;

    /** \returns the number of nonzero coefficients which is in practice the number
      * of stored coefficients. */
@@ -221,22 +260,21 @@ template<typename Derived> class DenseBase
      * nothing else.
      */
    EIGEN_DEVICE_FUNC
-    void resize(Index nbRows, Index nbCols)
+    void resize(Index rows, Index cols)
    {
-      EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
-      EIGEN_ONLY_USED_FOR_DEBUG(nbCols);
-      eigen_assert(nbRows == this->rows() && nbCols == this->cols()
+      EIGEN_ONLY_USED_FOR_DEBUG(rows);
+      EIGEN_ONLY_USED_FOR_DEBUG(cols);
+      eigen_assert(rows == this->rows() && cols == this->cols()
                && "DenseBase::resize() does not actually allow to resize.");
    }

 #ifndef EIGEN_PARSED_BY_DOXYGEN
-
    /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
    /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,false>,Derived> SequentialLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar,false>,PlainObject> SequentialLinSpacedReturnType;
    /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,true>,Derived> RandomAccessLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar,true>,PlainObject> RandomAccessLinSpacedReturnType;
    /** \internal the return type of MatrixBase::eigenvalues() */
    typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;

@@ -269,18 +307,17 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& func);

-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Copies \a other into *this without evaluating other. \returns a reference to *this.
+    /** \ínternal
+      * Copies \a other into *this without evaluating other. \returns a reference to *this.
      * \deprecated */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
    Derived& lazyAssign(const DenseBase<OtherDerived>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN

    EIGEN_DEVICE_FUNC
    CommaInitializer<Derived> operator<< (const Scalar& s);

-    // TODO flagged is temporarly disabled. It seems useless now
+    /** \deprecated it now returns \c *this */
    template<unsigned int Added,unsigned int Removed>
    EIGEN_DEPRECATED
    const Derived& flagged() const
@@ -316,13 +353,13 @@ template<typename Derived> class DenseBase
    LinSpaced(const Scalar& low, const Scalar& high);

    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
    NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
    NullaryExpr(Index size, const CustomNullaryOp& func);
    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
    NullaryExpr(const CustomNullaryOp& func);

    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
@@ -368,6 +405,8 @@ template<typename Derived> class DenseBase
      *
      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
      * a const reference, in order to avoid a useless copy.
+      * 
+      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
      */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE EvalReturnType eval() const
@@ -429,8 +468,7 @@ template<typename Derived> class DenseBase

    template<typename BinaryOp>
    EIGEN_DEVICE_FUNC
-    typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
-    redux(const BinaryOp& func) const;
+    Scalar redux(const BinaryOp& func) const;

    template<typename Visitor>
    EIGEN_DEVICE_FUNC
@@ -456,14 +494,35 @@ template<typename Derived> class DenseBase
    typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
    typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;

-    ConstRowwiseReturnType rowwise() const;
-    RowwiseReturnType rowwise();
-    ConstColwiseReturnType colwise() const;
-    ColwiseReturnType colwise();
+    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    *
+    * Example: \include MatrixBase_rowwise.cpp
+    * Output: \verbinclude MatrixBase_rowwise.out
+    *
+    * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC inline ConstRowwiseReturnType rowwise() const {
+      return ConstRowwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();

-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index rows, Index cols);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index size);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random();
+    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    *
+    * Example: \include MatrixBase_colwise.cpp
+    * Output: \verbinclude MatrixBase_colwise.out
+    *
+    * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    EIGEN_DEVICE_FUNC inline ConstColwiseReturnType colwise() const {
+      return ConstColwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC ColwiseReturnType colwise();
+
+    typedef CwiseNullaryOp<internal::scalar_random_op<Scalar>,PlainObject> RandomReturnType;
+    static const RandomReturnType Random(Index rows, Index cols);
+    static const RandomReturnType Random(Index size);
+    static const RandomReturnType Random();

    template<typename ThenDerived,typename ElseDerived>
    const Select<Derived,ThenDerived,ElseDerived>
@@ -481,14 +540,33 @@ template<typename Derived> class DenseBase
    template<int p> RealScalar lpNorm() const;

    template<int RowFactor, int ColFactor>
+    EIGEN_DEVICE_FUNC
    const Replicate<Derived,RowFactor,ColFactor> replicate() const;
-    const Replicate<Derived,Dynamic,Dynamic> replicate(Index rowFacor,Index colFactor) const;
+    /**
+    * \returns an expression of the replication of \c *this
+    *
+    * Example: \include MatrixBase_replicate_int_int.cpp
+    * Output: \verbinclude MatrixBase_replicate_int_int.out
+    *
+    * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC
+    const Replicate<Derived, Dynamic, Dynamic> replicate(Index rowFactor, Index colFactor) const
+    {
+      return Replicate<Derived, Dynamic, Dynamic>(derived(), rowFactor, colFactor);
+    }

    typedef Reverse<Derived, BothDirections> ReverseReturnType;
    typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
-    ReverseReturnType reverse();
-    ConstReverseReturnType reverse() const;
-    void reverseInPlace();
+    EIGEN_DEVICE_FUNC ReverseReturnType reverse();
+    /** This is the const version of reverse(). */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC ConstReverseReturnType reverse() const
+    {
+      return ConstReverseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC void reverseInPlace();

 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
 #   include "../plugins/BlockMethods.h"
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -97,7 +97,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    {
      eigen_internal_assert(row >= 0 && row < rows()
                         && col >= 0 && col < cols());
-      return typename internal::evaluator<Derived>::type(derived()).coeff(row,col);
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
    }

    EIGEN_DEVICE_FUNC
@@ -139,7 +139,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    coeff(Index index) const
    {
      eigen_internal_assert(index >= 0 && index < size());
-      return typename internal::evaluator<Derived>::type(derived()).coeff(index);
+      return internal::evaluator<Derived>(derived()).coeff(index);
    }


@@ -216,8 +216,9 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    template<int LoadMode>
    EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
    {
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
      eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
-      return typename internal::evaluator<Derived>::type(derived()).template packet<LoadMode>(row,col);
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(row,col);
    }


@@ -242,8 +243,9 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
    template<int LoadMode>
    EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
    {
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
      eigen_internal_assert(index >= 0 && index < size());
-      return typename internal::evaluator<Derived>::type(derived()).template packet<LoadMode>(index);
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(index);
    }

  protected:
@@ -323,7 +325,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    {
      eigen_internal_assert(row >= 0 && row < rows()
                         && col >= 0 && col < cols());
-      return typename internal::evaluator<Derived>::type(derived()).coeffRef(row,col);
+      return internal::evaluator<Derived>(derived()).coeffRef(row,col);
    }

    EIGEN_DEVICE_FUNC
@@ -369,7 +371,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
    coeffRef(Index index)
    {
      eigen_internal_assert(index >= 0 && index < size());
-      return typename internal::evaluator<Derived>::type(derived()).coeffRef(index);
+      return internal::evaluator<Derived>(derived()).coeffRef(index);
    }

    /** \returns a reference to the coefficient at given index.
@@ -580,33 +582,42 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>

 namespace internal {

-template<typename Derived, bool JustReturnZero>
+template<int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
  static inline Index run(const Derived&)
  { return 0; }
 };

-template<typename Derived>
-struct first_aligned_impl<Derived, false>
+template<int Alignment, typename Derived>
+struct first_aligned_impl<Alignment, Derived, false>
 {
  static inline Index run(const Derived& m)
  {
-    return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
+    return internal::first_aligned<Alignment>(&m.const_cast_derived().coeffRef(0,0), m.size());
  }
 };

-/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization.
+  *
+  * \tparam Alignment requested alignment in bytes.
  *
  * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
  * documentation.
  */
-template<typename Derived>
-static inline Index first_aligned(const Derived& m)
+template<int Alignment, typename Derived>
+static inline Index first_aligned(const DenseBase<Derived>& m)
 {
-  return first_aligned_impl
-          <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
-          ::run(m);
+  enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
+  return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
+}
+/** \internal \returns first_aligned(m) for the alignment of the default packet type of \c Derived::Scalar. */
+template<typename Derived>
+static inline Index first_default_aligned(const DenseBase<Derived>& m)
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(m);
 }

 template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -40,8 +40,7 @@ void check_static_allocation_size()
  */
 template <typename T, int Size, int MatrixOrArrayOptions,
          int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
-                        : 0 >
+                        : compute_default_alignment<T,Size>::value >
 struct plain_array
 {
  T array[Size];
@@ -81,14 +80,71 @@ struct plain_array
 #endif

 template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
+struct plain_array<T, Size, MatrixOrArrayOptions, 8>
 {
-  EIGEN_USER_ALIGN_DEFAULT T array[Size];
+  EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];

  EIGEN_DEVICE_FUNC
  plain_array() 
  { 
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 32>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 64>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  { 
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
    check_static_allocation_size<T,Size>();
  }

@@ -102,7 +158,7 @@ struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
-  EIGEN_USER_ALIGN_DEFAULT T array[1];
+  T array[1];
  EIGEN_DEVICE_FUNC plain_array() {}
  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
@@ -140,7 +196,13 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
      if (this != &other) m_data = other.m_data;
      return *this; 
    }
-    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
+      EIGEN_UNUSED_VARIABLE(size);
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
@@ -186,10 +248,10 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
    Index m_cols;
  public:
    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
-    DenseStorage& operator=(const DenseStorage& other) 
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
    { 
      if (this != &other)
      {
@@ -199,13 +261,13 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
      }
      return *this; 
    }
-    DenseStorage(Index, Index nbRows, Index nbCols) : m_rows(nbRows), m_cols(nbCols) {}
-    void swap(DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
-    void conservativeResize(Index, Index nbRows, Index nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    void resize(Index, Index nbRows, Index nbCols) { m_rows = nbRows; m_cols = nbCols; }
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -217,10 +279,10 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
    Index m_rows;
  public:
    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
-    DenseStorage& operator=(const DenseStorage& other) 
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
    {
      if (this != &other)
      {
@@ -229,12 +291,12 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
      }
      return *this; 
    }
-    DenseStorage(Index, Index nbRows, Index) : m_rows(nbRows) {}
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
-    void conservativeResize(Index, Index nbRows, Index) { m_rows = nbRows; }
-    void resize(Index, Index nbRows, Index) { m_rows = nbRows; }
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -246,10 +308,10 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
    Index m_cols;
  public:
    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
-    DenseStorage& operator=(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other)
      {
@@ -258,12 +320,12 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
      }
      return *this;
    }
-    DenseStorage(Index, Index, Index nbCols) : m_cols(nbCols) {}
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
    EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index, Index, Index nbCols) { m_cols = nbCols; }
-    void resize(Index, Index, Index nbCols) { m_cols = nbCols; }
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; } // no reallocation: only the column count is updated
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; } // no reallocation: only the column count is updated
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -276,19 +338,22 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
    Index m_cols;
  public:
    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(0), m_rows(0), m_cols(0) {}
-    DenseStorage(Index size, Index nbRows, Index nbCols)
-      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols))
      , m_rows(other.m_rows)
      , m_cols(other.m_cols)
    {
      internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
    }
-    DenseStorage& operator=(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other)
      {
@@ -298,6 +363,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      return *this;
    }
 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
    DenseStorage(DenseStorage&& other)
      : m_data(std::move(other.m_data))
      , m_rows(std::move(other.m_rows))
@@ -307,6 +373,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      other.m_rows = 0;
      other.m_cols = 0;
    }
+    EIGEN_DEVICE_FUNC
    DenseStorage& operator=(DenseStorage&& other)
    {
      using std::swap;
@@ -316,18 +383,18 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      return *this;
    }
 #endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
-    void swap(DenseStorage& other)
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index size, Index nbRows, Index nbCols)
+    void conservativeResize(Index size, Index rows, Index cols)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
-      m_rows = nbRows;
-      m_cols = nbCols;
+      m_rows = rows;
+      m_cols = cols;
    }
-    void resize(Index size, Index nbRows, Index nbCols)
+    EIGEN_DEVICE_FUNC void resize(Index size, Index rows, Index cols)
    {
      if(size != m_rows*m_cols)
      {
@@ -338,8 +405,8 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
          m_data = 0;
        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
      }
-      m_rows = nbRows;
-      m_cols = nbCols;
+      m_rows = rows;
+      m_cols = cols;
    }
    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
    EIGEN_DEVICE_FUNC T *data() { return m_data; }
@@ -353,15 +420,19 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
  public:
    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-    DenseStorage(Index size, Index, Index nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
+      EIGEN_UNUSED_VARIABLE(rows);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
      , m_cols(other.m_cols)
    {
      internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
    }
-    DenseStorage& operator=(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other)
      {
@@ -371,6 +442,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      return *this;
    }    
 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
    DenseStorage(DenseStorage&& other)
      : m_data(std::move(other.m_data))
      , m_cols(std::move(other.m_cols))
@@ -378,6 +450,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      other.m_data = nullptr;
      other.m_cols = 0;
    }
+    EIGEN_DEVICE_FUNC
    DenseStorage& operator=(DenseStorage&& other)
    {
      using std::swap;
@@ -386,16 +459,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      return *this;
    }
 #endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index size, Index, Index nbCols)
+    EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
-      m_cols = nbCols;
+      m_cols = cols;
    }
-    EIGEN_STRONG_INLINE void resize(Index size, Index, Index nbCols)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols)
    {
      if(size != _Rows*m_cols)
      {
@@ -406,7 +479,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
          m_data = 0;
        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
      }
-      m_cols = nbCols;
+      m_cols = cols;
    }
    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
    EIGEN_DEVICE_FUNC T *data() { return m_data; }
@@ -420,15 +493,19 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
  public:
    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-    DenseStorage(Index size, Index nbRows, Index) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    DenseStorage(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
      , m_rows(other.m_rows)
    {
      internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
    }
-    DenseStorage& operator=(const DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
    {
      if (this != &other)
      {
@@ -438,6 +515,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      return *this;
    }    
 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
    DenseStorage(DenseStorage&& other)
      : m_data(std::move(other.m_data))
      , m_rows(std::move(other.m_rows))
@@ -445,6 +523,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      other.m_data = nullptr;
      other.m_rows = 0;
    }
+    EIGEN_DEVICE_FUNC
    DenseStorage& operator=(DenseStorage&& other)
    {
      using std::swap;
@@ -453,16 +532,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      return *this;
    }
 #endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
-    void conservativeResize(Index size, Index nbRows, Index)
+    void conservativeResize(Index size, Index rows, Index)
    {
      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
-      m_rows = nbRows;
+      m_rows = rows;
    }
-    EIGEN_STRONG_INLINE void resize(Index size, Index nbRows, Index)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index)
    {
      if(size != m_rows*_Cols)
      {
@@ -473,7 +552,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
          m_data = 0;
        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
      }
-      m_rows = nbRows;
+      m_rows = rows;
    }
    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
    EIGEN_DEVICE_FUNC T *data() { return m_data; }
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -37,7 +37,7 @@ template<typename MatrixType, int DiagIndex>
 struct traits<Diagonal<MatrixType,DiagIndex> >
 : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
  typedef typename MatrixType::StorageKind StorageKind;
  enum {
@@ -170,7 +170,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
-    // trigger a compile time error is someone try to call packet
+    // trigger a compile-time error if someone tries to call packet
    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
 };
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -99,7 +99,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
 template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
-  using std::sqrt;
+  EIGEN_USING_STD_MATH(sqrt)
  return sqrt(squaredNorm());
 }

@@ -141,7 +141,7 @@ struct lpNorm_selector
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const MatrixBase<Derived>& m)
  {
-    using std::pow;
+    EIGEN_USING_STD_MATH(pow)
    return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
  }
 };
@@ -224,13 +224,13 @@ bool MatrixBase<Derived>::isOrthogonal
 template<typename Derived>
 bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
 {
-  typename Derived::Nested nested(derived());
+  typename internal::nested_eval<Derived,1>::type self(derived());
  for(Index i = 0; i < cols(); ++i)
  {
-    if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
+    if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
      return false;
    for(Index j = 0; j < i; ++j)
-      if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec))
+      if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec))
        return false;
  }
  return true;
--- a/Eigen/src/Core/Flagged.h
+++ b/Eigen/src/Core/Flagged.h
@@ -1,140 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FLAGGED_H
-#define EIGEN_FLAGGED_H
-
-namespace Eigen { 
-
-/** \class Flagged
-  * \ingroup Core_Module
-  *
-  * \brief Expression with modified flags
-  *
-  * \param ExpressionType the type of the object of which we are modifying the flags
-  * \param Added the flags added to the expression
-  * \param Removed the flags removed from the expression (has priority over Added).
-  *
-  * This class represents an expression whose flags have been modified.
-  * It is the return type of MatrixBase::flagged()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::flagged()
-  */
-
-namespace internal {
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-struct traits<Flagged<ExpressionType, Added, Removed> > : traits<ExpressionType>
-{
-  enum { Flags = (ExpressionType::Flags | Added) & ~Removed };
-};
-}
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged
-  : public MatrixBase<Flagged<ExpressionType, Added, Removed> >
-{
-  public:
-
-    typedef MatrixBase<Flagged> Base;
-    
-    EIGEN_DENSE_PUBLIC_INTERFACE(Flagged)
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
-    typedef typename ExpressionType::InnerIterator InnerIterator;
-
-    explicit inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
-
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_matrix.innerStride(); }
-
-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(index);
-    }
-    
-    EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index col) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_matrix.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_matrix.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
-
-    EIGEN_DEVICE_FUNC const ExpressionType& _expression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
-
-  protected:
-    ExpressionTypeNested m_matrix;
-};
-
-/** \returns an expression of *this with added and removed flags
-  *
-  * This is mostly for internal use.
-  *
-  * \sa class Flagged
-  */
-template<typename Derived>
-template<unsigned int Added,unsigned int Removed>
-inline const Flagged<Derived, Added, Removed>
-DenseBase<Derived>::flagged() const
-{
-  return Flagged<Derived, Added, Removed>(derived());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_FLAGGED_H
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -183,7 +183,7 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_ALIGN_STATICALLY
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
  #else
@@ -196,7 +196,7 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
  EIGEN_STRONG_INLINE Scalar* data() {
    return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
            : m_data.array;
  }
  #endif
@@ -249,8 +249,8 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>

    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;

-    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
-    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+    const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+    const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;

    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);

--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -49,6 +49,7 @@ struct default_packet_traits
    HasMul    = 1,
    HasNegate = 1,
    HasAbs    = 1,
+    HasArg    = 0,
    HasAbs2   = 1,
    HasMin    = 1,
    HasMax    = 1,
@@ -58,8 +59,10 @@ struct default_packet_traits

    HasDiv    = 0,
    HasSqrt   = 0,
+    HasRsqrt  = 0,
    HasExp    = 0,
    HasLog    = 0,
+    HasLog10    = 0,
    HasPow    = 0,

    HasSin    = 0,
@@ -67,7 +70,14 @@ struct default_packet_traits
    HasTan    = 0,
    HasASin   = 0,
    HasACos   = 0,
-    HasATan   = 0
+    HasATan   = 0,
+    HasSinh    = 0,
+    HasCosh    = 0,
+    HasTanh    = 0,
+
+    HasRound  = 0,
+    HasFloor  = 0,
+    HasCeil   = 0
  };
 };

@@ -97,6 +107,28 @@ template<typename T> struct packet_traits : default_packet_traits

 template<typename T> struct packet_traits<const T> : packet_traits<T> { };

+template <typename Src, typename Tgt> struct type_casting_traits {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+/** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a) {
+  return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
+  return static_cast<TgtPacket>(a);
+}
+
+
 /** \internal \returns a + b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 padd(const Packet& a,
@@ -140,6 +172,10 @@ pmax(const Packet& a,
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pabs(const Packet& a) { using std::abs; return abs(a); }

+/** \internal \returns the phase angle of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+parg(const Packet& a) { using numext::arg; return arg(a); }
+
 /** \internal \returns the bitwise and of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pand(const Packet& a, const Packet& b) { return a & b; }
@@ -225,8 +261,8 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
 }

 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Scalar> inline typename packet_traits<Scalar>::type
-plset(const Scalar& a) { return a; }
+template<typename Packet> inline Packet
+plset(const typename unpacket_traits<Packet>::type& a) { return a; }

 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
@@ -245,7 +281,15 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> inline void prefetch(const Scalar* addr)
 {
-#if !EIGEN_COMP_MSVC
+#ifdef __CUDA_ARCH__
+#if defined(__LP64__)
+  // 64-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+#else
+  // 32-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
+#endif
+#elif !EIGEN_COMP_MSVC
  __builtin_prefetch(addr);
 #endif
 }
@@ -287,6 +331,21 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }

+template<size_t offset, typename Packet>
+struct protate_impl
+{
+  // Empty so attempts to use this unimplemented path will fail to compile.
+  // Only specializations of this template should be used.
+};
+
+/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
+  * by the given offset, e.g. for offset == 1:
+  *     (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
+  */
+template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
+{
+  return offset ? protate_impl<offset, Packet>::run(a) : a;
+}

 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
@@ -321,10 +380,22 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }

-/** \internal \returns the atan of \a a (coeff-wise) */
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet patan(const Packet& a) { using std::atan; return atan(a); }

+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
+
+/** \internal \returns the hyperbolic tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -333,10 +404,32 @@ Packet pexp(const Packet& a) { using std::exp; return exp(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog(const Packet& a) { using std::log; return log(a); }

+/** \internal \returns the log10 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog10(const Packet& a) { using std::log10; return log10(a); }
+
 /** \internal \returns the square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }

+/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet prsqrt(const Packet& a) {
+  return pdiv(pset1<Packet>(1), psqrt(a));
+}
+
+/** \internal \returns the rounded value of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pround(const Packet& a) { using numext::round; return round(a); }
+
+/** \internal \returns the floor of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
+
+/** \internal \returns the ceil of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
+
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
@@ -357,22 +450,22 @@ pmadd(const Packet&  a,
 { return padd(pmul(a, b),c); }

 /** \internal \returns a packet version of \a *from.
-  * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
-template<typename Packet, int LoadMode>
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::alignment)
    return pload<Packet>(from);
  else
    return ploadu<Packet>(from);
 }

 /** \internal copy the packet \a from to \a *to.
-  * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet, int LoadMode>
+  * The pointer \a to must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::alignment)
    pstore(to, from);
  else
    pstoreu(to, from);
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -14,7 +14,7 @@
 #define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
  template<typename Derived> \
  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
-  NAME(const Eigen::ArrayBase<Derived>& x) { \
+  (NAME)(const Eigen::ArrayBase<Derived>& x) { \
    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
  }

@@ -34,22 +34,36 @@
    } \
  };

-
 namespace Eigen
 {
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op)
  
  template<typename Derived>
  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
@@ -57,16 +71,46 @@ namespace Eigen
    return x.derived().pow(exponent);
  }

-  template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<Derived>& exponents) 
+  /** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power.
+    *
+    * Example: \include Cwise_array_power_array.cpp
+    * Output: \verbinclude Cwise_array_power_array.out
+    * 
+    * \sa ArrayBase::pow()
+    */
+  template<typename Derived,typename ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>(
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
      x.derived(),
      exponents.derived()
    );
  }
  
+  /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power between a scalar and an array of exponents.
+    * Be aware that the scalar type of the input scalar \a x and the exponents \a exponents must be the same.
+    *
+    * Example: \include Cwise_scalar_power_array.cpp
+    * Output: \verbinclude Cwise_scalar_power_array.out
+    * 
+    * \sa ArrayBase::pow()
+    */
+  template<typename Derived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>
+  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents) 
+  {
+    typename Derived::ConstantReturnType constant_x(exponents.rows(), exponents.cols(), x);
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>(
+      constant_x,
+      exponents.derived()
+    );
+  }
+  
  /**
  * \brief Component-wise division of a scalar by array elements.
  **/
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -47,7 +47,7 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::S
 public:
  typedef typename XprType::StorageIndex StorageIndex;
  typedef typename XprType::PlainObject                       PlainObject;
-  typedef typename internal::nested<XprType>::type            XprTypeNested;
+  typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
  
  explicit Inverse(const XprType &xpr)
@@ -100,14 +100,11 @@ namespace internal {
  */
 template<typename ArgType>
 struct unary_evaluator<Inverse<ArgType> >
-  : public evaluator<typename Inverse<ArgType>::PlainObject>::type
+  : public evaluator<typename Inverse<ArgType>::PlainObject>
 {
  typedef Inverse<ArgType> InverseType;
  typedef typename InverseType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
-  
-  typedef evaluator<InverseType> type;
-  typedef evaluator<InverseType> nestedType;
+  typedef evaluator<PlainObject> Base;
  
  enum { Flags = Base::Flags | EvalBeforeNestingBit };

--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -19,7 +19,7 @@ namespace Eigen {
  * \brief A matrix or vector expression mapping an existing array of data.
  *
  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
  *                The default is \c #Unaligned.
  * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
  *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
@@ -77,7 +77,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
                             ? int(PlainObjectType::OuterStrideAtCompileTime)
                             : int(StrideType::OuterStrideAtCompileTime),
-    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
+    Alignment = int(MapOptions)&int(AlignedMask),
    Flags0 = TraitsBase::Flags & (~NestByRefBit),
    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
  };
@@ -117,11 +117,11 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    /** Constructor in the fixed-size case.
      *
      * \param dataPtr pointer to the array to map
-      * \param a_stride optional Stride object, passing the strides.
+      * \param stride optional Stride object, passing the strides.
      */
    EIGEN_DEVICE_FUNC
-    explicit inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
+    explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr)), m_stride(stride)
    {
      PlainObjectType::Base::_check_template_params();
    }
@@ -129,12 +129,12 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    /** Constructor in the dynamic-size vector case.
      *
      * \param dataPtr pointer to the array to map
-      * \param a_size the size of the vector expression
-      * \param a_stride optional Stride object, passing the strides.
+      * \param size the size of the vector expression
+      * \param stride optional Stride object, passing the strides.
      */
    EIGEN_DEVICE_FUNC
-    inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
+    inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride)
    {
      PlainObjectType::Base::_check_template_params();
    }
@@ -142,13 +142,13 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    /** Constructor in the dynamic-size matrix case.
      *
      * \param dataPtr pointer to the array to map
-      * \param nbRows the number of rows of the matrix expression
-      * \param nbCols the number of columns of the matrix expression
-      * \param a_stride optional Stride object, passing the strides.
+      * \param rows the number of rows of the matrix expression
+      * \param cols the number of columns of the matrix expression
+      * \param stride optional Stride object, passing the strides.
      */
    EIGEN_DEVICE_FUNC
-    inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
+    inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride)
    {
      PlainObjectType::Base::_check_template_params();
    }
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -146,12 +146,12 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    }

    EIGEN_DEVICE_FUNC
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
-            : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
+    inline MapBase(PointerType dataPtr, Index rows, Index cols)
+            : m_data(dataPtr), m_rows(rows), m_cols(cols)
    {
      eigen_assert( (dataPtr == 0)
-              || (   nbRows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-                  && nbCols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols)));
+              || (   rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+                  && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
      checkSanity();
    }

@@ -160,7 +160,9 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    EIGEN_DEVICE_FUNC
    void checkSanity() const
    {
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
+#if EIGEN_MAX_ALIGN_BYTES>0
+      eigen_assert(((size_t(m_data) % EIGEN_PLAIN_ENUM_MAX(1,internal::traits<Derived>::Alignment)) == 0) && "data is not aligned");
+#endif
    }

    PointerType m_data;
@@ -234,7 +236,7 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>

    EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
-    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}

    EIGEN_DEVICE_FUNC
    Derived& operator=(const MapBase& other)
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -10,6 +10,9 @@
 #ifndef EIGEN_MATHFUNCTIONS_H
 #define EIGEN_MATHFUNCTIONS_H

+// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
+#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406
+
 namespace Eigen {

 // On WINCE, std::abs is defined for int only, so let's defined our own overloads:
@@ -276,7 +279,7 @@ struct norm1_default_impl
  EIGEN_DEVICE_FUNC
  static inline RealScalar run(const Scalar& x)
  {
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
    return abs(real(x)) + abs(imag(x));
  }
 };
@@ -287,7 +290,7 @@ struct norm1_default_impl<Scalar, false>
  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
  {
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
    return abs(x);
  }
 };
@@ -313,8 +316,8 @@ struct hypot_impl
  {
    EIGEN_USING_STD_MATH(max);
    EIGEN_USING_STD_MATH(min);
-    using std::abs;
-    using std::sqrt;
+    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD_MATH(sqrt);
    RealScalar _x = abs(x);
    RealScalar _y = abs(y);
    Scalar p, qp;
@@ -328,6 +331,7 @@ struct hypot_impl
      p = _y;
      qp = _x / p;
    }
+    if(p==RealScalar(0)) return RealScalar(0);
    return p * sqrt(RealScalar(1) + qp*qp);
  }
 };
@@ -345,6 +349,7 @@ struct hypot_retval
 template<typename OldType, typename NewType>
 struct cast_impl
 {
+  EIGEN_DEVICE_FUNC
  static inline NewType run(const OldType& x)
  {
    return static_cast<NewType>(x);
@@ -354,35 +359,119 @@ struct cast_impl
 // here, for once, we're plainly returning NewType: we don't want cast to do weird things.

 template<typename OldType, typename NewType>
+EIGEN_DEVICE_FUNC
 inline NewType cast(const OldType& x)
 {
  return cast_impl<OldType, NewType>::run(x);
 }

 /****************************************************************************
-* Implementation of logp1                                                *
+* Implementation of round                                                   *
 ****************************************************************************/

+#if EIGEN_HAS_CXX11_MATH
+  template<typename Scalar>
+  struct round_impl {
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+      using std::round;
+      return round(x);
+    }
+  };
+#else
+  template<typename Scalar>
+  struct round_impl
+  {
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+      EIGEN_USING_STD_MATH(floor);
+      EIGEN_USING_STD_MATH(ceil);
+      return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5));
+    }
+  };
+#endif
+
 template<typename Scalar>
+struct round_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of arg                                                     *
+****************************************************************************/
+
+#if EIGEN_HAS_CXX11_MATH
+  template<typename Scalar>
+  struct arg_impl {
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_USING_STD_MATH(arg);
+      return arg(x);
+    }
+  };
+#else
+  template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+  struct arg_default_impl
+  {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_DEVICE_FUNC
+    static inline RealScalar run(const Scalar& x)
+    {
+      return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); }
+  };
+
+  template<typename Scalar>
+  struct arg_default_impl<Scalar,true>
+  {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_DEVICE_FUNC
+    static inline RealScalar run(const Scalar& x)
+    {
+      EIGEN_USING_STD_MATH(arg);
+      return arg(x);
+    }
+  };
+
+  template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
+#endif
+
+template<typename Scalar>
+struct arg_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of log1p                                                   *
+****************************************************************************/
+template<typename Scalar, bool isComplex = NumTraits<Scalar>::IsComplex >
 struct log1p_impl
 {
  static inline Scalar run(const Scalar& x)
  {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    // Let's be conservative and enable the default C++11 implementation only if we are sure it exists
-    #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
-        && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
-      using std::log1p;
-      return log1p(x);
-    #else
-      typedef typename NumTraits<Scalar>::Real RealScalar;
-      using std::log;
-      Scalar x1p = RealScalar(1) + x;
-      return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
-    #endif
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_USING_STD_MATH(log);
+    Scalar x1p = RealScalar(1) + x;
+    return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
  }
 };

+#if EIGEN_HAS_CXX11_MATH
+template<typename Scalar>
+struct log1p_impl<Scalar, false> {
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    using std::log1p;
+    return log1p(x);
+  }
+};
+#endif
+
 template<typename Scalar>
 struct log1p_retval
 {
@@ -399,7 +488,7 @@ struct pow_default_impl
  typedef Scalar retval;
  static inline Scalar run(const Scalar& x, const Scalar& y)
  {
-    using std::pow;
+    EIGEN_USING_STD_MATH(pow);
    return pow(x, y);
  }
 };
@@ -467,48 +556,48 @@ struct random_default_impl<Scalar, false, false>
 };

 enum {
-  floor_log2_terminate,
-  floor_log2_move_up,
-  floor_log2_move_down,
-  floor_log2_bogus
+  meta_floor_log2_terminate,
+  meta_floor_log2_move_up,
+  meta_floor_log2_move_down,
+  meta_floor_log2_bogus
 };

-template<unsigned int n, int lower, int upper> struct floor_log2_selector
+template<unsigned int n, int lower, int upper> struct meta_floor_log2_selector
 {
  enum { middle = (lower + upper) / 2,
-         value = (upper <= lower + 1) ? int(floor_log2_terminate)
-               : (n < (1 << middle)) ? int(floor_log2_move_down)
-               : (n==0) ? int(floor_log2_bogus)
-               : int(floor_log2_move_up)
+         value = (upper <= lower + 1) ? int(meta_floor_log2_terminate)
+               : (n < (1 << middle)) ? int(meta_floor_log2_move_down)
+               : (n==0) ? int(meta_floor_log2_bogus)
+               : int(meta_floor_log2_move_up)
  };
 };

 template<unsigned int n,
         int lower = 0,
         int upper = sizeof(unsigned int) * CHAR_BIT - 1,
-         int selector = floor_log2_selector<n, lower, upper>::value>
-struct floor_log2 {};
+         int selector = meta_floor_log2_selector<n, lower, upper>::value>
+struct meta_floor_log2 {};

 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_down>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down>
 {
-  enum { value = floor_log2<n, lower, floor_log2_selector<n, lower, upper>::middle>::value };
+  enum { value = meta_floor_log2<n, lower, meta_floor_log2_selector<n, lower, upper>::middle>::value };
 };

 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_up>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up>
 {
-  enum { value = floor_log2<n, floor_log2_selector<n, lower, upper>::middle, upper>::value };
+  enum { value = meta_floor_log2<n, meta_floor_log2_selector<n, lower, upper>::middle, upper>::value };
 };

 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_terminate>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate>
 {
  enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower };
 };

 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_bogus>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
 {
  // no value, error at compile time
 };
@@ -516,11 +605,24 @@ struct floor_log2<n, lower, upper, floor_log2_bogus>
 template<typename Scalar>
 struct random_default_impl<Scalar, false, true>
 {
-  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
-
  static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return x + Scalar((NonInteger(y)-x+1) * std::rand() / (RAND_MAX + NonInteger(1)));
+  { 
+    // Returns a pseudo-random integer in [x,y] drawn with std::rand(); returns x when y<x.
+    // ScalarX widens the bounds so that the unsigned difference below is computed modularly.
+    typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
+    if(y<x)
+      return x;
+    const std::size_t range = ScalarX(y)-ScalarX(x);
+    std::size_t offset = 0;
+    // rejection sampling; range+1 is only evaluated when it cannot wrap around to 0
+    std::size_t divisor    = 1;
+    std::size_t multiplier = 1;
+    if(range<std::size_t(RAND_MAX)) divisor    = (std::size_t(RAND_MAX)+1)/(range+1);
+    else                            multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
+    do {
+      offset = ( (std::size_t(std::rand()) * multiplier) / divisor );
+    } while (offset > range);
+
+    return Scalar(ScalarX(x) + offset);
  }

  static inline Scalar run()
@@ -528,7 +630,7 @@ struct random_default_impl<Scalar, false, true>
 #ifdef EIGEN_MAKING_DOCS
    return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
 #else
-    enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
           scalar_bits = sizeof(Scalar) * CHAR_BIT,
           shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
           offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
@@ -568,14 +670,15 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
 } // end namespace internal

 /****************************************************************************
-* Generic math function                                                    *
+* Generic math functions                                                    *
 ****************************************************************************/

 namespace numext {
-  
+
+#ifndef __CUDA_ARCH__
 template<typename T>
 EIGEN_DEVICE_FUNC
-inline T mini(const T& x, const T& y)
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
  EIGEN_USING_STD_MATH(min);
  return min EIGEN_NOT_A_MACRO (x,y);
@@ -583,11 +686,38 @@ inline T mini(const T& x, const T& y)

 template<typename T>
 EIGEN_DEVICE_FUNC
-inline T maxi(const T& x, const T& y)
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
  EIGEN_USING_STD_MATH(max);
  return max EIGEN_NOT_A_MACRO (x,y);
 }
+#else
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+{
+  return y < x ? y : x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
+{
+  return fmin(x, y);
+}
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+{
+  return x < y ? y : x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
+{
+  return fmax(x, y);
+}
+#endif
+

 template<typename Scalar>
 EIGEN_DEVICE_FUNC
@@ -617,6 +747,13 @@ inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
 }

+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) arg(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
+}
+
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
@@ -673,22 +810,81 @@ inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
 }

-// std::isfinite is non standard, so let's define our own version,
-// even though it is not very efficient.
 template<typename T>
 EIGEN_DEVICE_FUNC
 bool (isfinite)(const T& x)
 {
-  return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+  #if EIGEN_HAS_CXX11_MATH
+    using std::isfinite;
+    return isfinite EIGEN_NOT_A_MACRO (x);
+  #else
+    return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+  #endif
 }

 template<typename T>
 EIGEN_DEVICE_FUNC
+bool (isnan)(const T& x)
+{
+  #if EIGEN_HAS_CXX11_MATH
+    using std::isnan;
+    return isnan EIGEN_NOT_A_MACRO (x);
+  #else
+    return x != x;
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+bool (isinf)(const T& x)
+{
+  #if EIGEN_HAS_CXX11_MATH
+    using std::isinf;
+    return isinf EIGEN_NOT_A_MACRO (x);
+  #else
+    return x>NumTraits<T>::highest() || x<NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
 bool (isfinite)(const std::complex<T>& x)
 {
-  using std::real;
-  using std::imag;
-  return isfinite(real(x)) && isfinite(imag(x));
+  return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x));
+}
+
+template<typename T>
+bool (isnan)(const std::complex<T>& x)
+{
+  return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
+}
+
+template<typename T>
+bool (isinf)(const std::complex<T>& x)
+{
+  return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x));
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+T (floor)(const T& x)
+{
+  EIGEN_USING_STD_MATH(floor);
+  return floor(x);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+T (ceil)(const T& x)
+{
+  EIGEN_USING_STD_MATH(ceil);
+  return ceil(x);
 }

 // Log base 2 for 32 bits positive integers.
@@ -726,14 +922,14 @@ struct scalar_fuzzy_default_impl<Scalar, false, false>
  template<typename OtherScalar> EIGEN_DEVICE_FUNC
  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
  {
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
    return abs(x) <= abs(y) * prec;
  }
  EIGEN_DEVICE_FUNC
  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
  {
    EIGEN_USING_STD_MATH(min);
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
    return abs(x - y) <= (min)(abs(x), abs(y)) * prec;
  }
  EIGEN_DEVICE_FUNC
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -24,13 +24,13 @@ namespace Eigen {
  * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note").
  *
  * The first three template parameters are required:
-  * \tparam _Scalar \anchor matrix_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
-  *                 User defined sclar types are supported as well (see \ref user_defined_scalars "here").
+  * \tparam _Scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
  * \tparam _Rows Number of rows, or \b Dynamic
  * \tparam _Cols Number of columns, or \b Dynamic
  *
  * The remaining template parameters are optional -- in most cases you don't have to worry about them.
-  * \tparam _Options \anchor matrix_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of either
  *                 \b #AutoAlign or \b #DontAlign.
  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
  *                 for vectorization. It defaults to aligning matrices except for fixed sizes that aren't a multiple of the packet size.
@@ -97,6 +97,40 @@ namespace Eigen {
  * are the dimensions of the original matrix, while _Rows and _Cols are Dynamic.</dd>
  * </dl>
  *
+  * <i><b>ABI and storage layout</b></i>
+  * 
+  * The table below summarizes the ABI of some possible Matrix instances which is fixed throughout the lifetime of Eigen 3.
+  * <table  class="manual">
+  * <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
+  * <tr><td>\code Matrix<T,Dynamic,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code
+  * Matrix<T,Dynamic,1>
+  * Matrix<T,1,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index size;
+  *  };
+  * \endcode</td></tr>
+  * <tr><td>\code Matrix<T,Rows,Cols> \endcode</td><td>\code
+  * struct {
+  *   T data[Rows*Cols];        // with (size_t(data)%A(Rows*Cols*sizeof(T)))==0
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code Matrix<T,Dynamic,Dynamic,0,MaxRows,MaxCols> \endcode</td><td>\code
+  * struct {
+  *   T data[MaxRows*MaxCols];  // with (size_t(data)%A(MaxRows*MaxCols*sizeof(T)))==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * </table>
+  * Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined as the largest possible power-of-two
+  * smaller than EIGEN_MAX_STATIC_ALIGN_BYTES.
+  * 
  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, 
  * \ref TopicStorageOrders 
  */
@@ -105,6 +139,20 @@ namespace internal {
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
 {
+private:
+  enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
+  typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
+  enum {
+      row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
+      max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
+      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
+      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
+      required_alignment = unpacket_traits<PacketScalar>::alignment,
+      packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0
+    };
+    
+public:
  typedef _Scalar Scalar;
  typedef Dense StorageKind;
  typedef Eigen::Index StorageIndex;
@@ -115,11 +163,13 @@ struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
    MaxRowsAtCompileTime = _MaxRows,
    MaxColsAtCompileTime = _MaxCols,
    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
-    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
-    EvaluatorFlags = compute_matrix_evaluator_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
    Options = _Options,
    InnerStrideAtCompileTime = 1,
-    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
+    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
+    
+    // FIXME, the following flag is only used to define NeedsToAlign in PlainObjectBase
+    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
+    Alignment = actual_alignment
  };
 };
 }
@@ -170,7 +220,7 @@ class Matrix
      */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
+    EIGEN_STRONG_INLINE Matrix& operator=(const DenseBase<OtherDerived>& other)
    {
      return Base::_set(other);
    }
@@ -219,6 +269,7 @@ class Matrix
    { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }

 #ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
    Matrix(Matrix&& other)
      : Base(std::move(other))
    {
@@ -226,6 +277,7 @@ class Matrix
      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
        Base::_set_noalias(other);
    }
+    EIGEN_DEVICE_FUNC
    Matrix& operator=(Matrix&& other)
    {
      other.swap(*this);
@@ -264,8 +316,8 @@ class Matrix
      * 
      * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
      * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
-      * For fixed-size \c 1x1 matrices it is thefore recommended to use the default
-      * constructor Matrix() instead, especilly when using one of the non standard
+      * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non standard
      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
      */
    EIGEN_STRONG_INLINE explicit Matrix(Index dim);
@@ -279,8 +331,8 @@ class Matrix
      * 
      * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
      * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
-      * For fixed-size \c 1x2 or \c 2x1 vectors it is thefore recommended to use the default
-      * constructor Matrix() instead, especilly when using one of the non standard
+      * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non standard
      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
      */
    EIGEN_DEVICE_FUNC
@@ -313,37 +365,10 @@ class Matrix
    }


-    /** \brief Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      // This test resides here, to bring the error messages closer to the user. Normally, these checks
-      // are performed deeply within the library, thus causing long and scary error traces.
-      EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
    /** \brief Copy constructor */
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const Matrix& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** \brief Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
+    EIGEN_STRONG_INLINE Matrix(const Matrix& other) : Base(other)
+    { }

    /** \brief Copy constructor for generic expressions.
      * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
@@ -351,14 +376,8 @@ class Matrix
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::_resize_to_match(other);
-      // FIXME/CHECK: isn't *this = other.derived() more efficient. it allows to
-      //              go for pure _set() implementations, right?
-      *this = other;
-    }
+      : Base(other.derived())
+    { }

    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -81,6 +81,7 @@ template<typename Derived> class MatrixBase
    using Base::operator*=;
    using Base::operator/=;
    using Base::operator*;
+    using Base::operator/;

    typedef typename Base::CoeffReturnType CoeffReturnType;
    typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@@ -101,23 +102,11 @@ template<typename Derived> class MatrixBase
    EIGEN_DEVICE_FUNC
    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }

-    /** \brief The plain matrix type corresponding to this expression.
-      *
-      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
-      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
-      * that the return type of eval() is either PlainObject or const PlainObject&.
-      */
-    typedef Matrix<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
+    typedef typename Base::PlainObject PlainObject;

 #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
    /** \internal the return type of MatrixBase::adjoint() */
    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
@@ -126,7 +115,7 @@ template<typename Derived> class MatrixBase
    /** \internal Return type of eigenvalues() */
    typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType;
    /** \internal the return type of identity */
-    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,Derived> IdentityReturnType;
+    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,PlainObject> IdentityReturnType;
    /** \internal the return type of unit vectors */
    typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
                  internal::traits<Derived>::RowsAtCompileTime,
@@ -164,12 +153,6 @@ template<typename Derived> class MatrixBase
    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& other);

-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_DEVICE_FUNC
-    Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
    Derived& operator+=(const MatrixBase<OtherDerived>& other);
@@ -329,7 +312,7 @@ template<typename Derived> class MatrixBase
    template<bool Enable> inline const Derived& forceAlignedAccessIf() const { return derived(); }
    template<bool Enable> inline Derived& forceAlignedAccessIf() { return derived(); }

-    Scalar trace() const;
+    EIGEN_DEVICE_FUNC Scalar trace() const;

    template<int p> EIGEN_DEVICE_FUNC RealScalar lpNorm() const;

@@ -412,7 +395,8 @@ template<typename Derived> class MatrixBase
    
    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
    // put this as separate enum value to work around possible GCC 4.3 bug (?)
-    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
+    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
+                                          : ColsAtCompileTime==1 ? Vertical : Horizontal };
    typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
    HomogeneousReturnType homogeneous() const;
    
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -83,8 +83,25 @@ template<typename T> struct GenericNumTraits
    // make sure to override this for floating-point types
    return Real(0);
  }
-  static inline T highest() { return (std::numeric_limits<T>::max)(); }
-  static inline T lowest()  { return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); }
+
+
+  EIGEN_DEVICE_FUNC
+  static inline T highest() {
+#if defined(__CUDA_ARCH__)
+    return (internal::device::numeric_limits<T>::max)();
+#else
+    return (std::numeric_limits<T>::max)();
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline T lowest()  {
+#if defined(__CUDA_ARCH__)
+    return IsInteger ? (internal::device::numeric_limits<T>::min)() : (-(internal::device::numeric_limits<T>::max)());
+#else
+    return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)());
+#endif
+  }
 };

 template<typename T> struct NumTraits : GenericNumTraits<T>
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -42,10 +42,6 @@ namespace Eigen {

 namespace internal {

-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_matrix_product_retval;
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_sparsematrix_product_retval;
 enum PermPermProduct_t {PermPermProduct};

 } // end namespace internal
@@ -353,7 +349,7 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
      * array's size.
      */
    template<typename Other>
-    explicit inline PermutationMatrix(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    explicit inline PermutationMatrix(const MatrixBase<Other>& indices) : m_indices(indices)
    {}

    /** Convert the Transpositions \a tr to a permutation matrix */
@@ -401,12 +397,12 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
 #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename Other>
    PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
-      : m_indices(other.nestedPermutation().size())
+      : m_indices(other.nestedExpression().size())
    {
      eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndex>::highest());
      StorageIndex end = StorageIndex(m_indices.size());
      for (StorageIndex i=0; i<end;++i)
-        m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
+        m_indices.coeffRef(other.nestedExpression().indices().coeff(i)) = i;
    }
    template<typename Lhs,typename Rhs>
    PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
@@ -527,8 +523,8 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
    typedef typename Traits::IndicesType IndicesType;
    #endif

-    inline PermutationWrapper(const IndicesType& a_indices)
-      : m_indices(a_indices)
+    inline PermutationWrapper(const IndicesType& indices)
+      : m_indices(indices)
    {}

    /** const version of indices(). */
@@ -541,18 +537,15 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
 };


-// TODO: Do we need to define these operator* functions? Would it be better to have them inherited
-// from MatrixBase?
-
 /** \returns the matrix with the permutation applied to the columns.
  */
 template<typename MatrixDerived, typename PermutationDerived>
 EIGEN_DEVICE_FUNC
-const Product<MatrixDerived, PermutationDerived, DefaultProduct>
+const Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
 operator*(const MatrixBase<MatrixDerived> &matrix,
          const PermutationBase<PermutationDerived>& permutation)
 {
-  return Product<MatrixDerived, PermutationDerived, DefaultProduct>
+  return Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
            (matrix.derived(), permutation.derived());
 }

@@ -560,89 +553,16 @@ operator*(const MatrixBase<MatrixDerived> &matrix,
  */
 template<typename PermutationDerived, typename MatrixDerived>
 EIGEN_DEVICE_FUNC
-const Product<PermutationDerived, MatrixDerived, DefaultProduct>
+const Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
 operator*(const PermutationBase<PermutationDerived> &permutation,
          const MatrixBase<MatrixDerived>& matrix)
 {
-  return Product<PermutationDerived, MatrixDerived, DefaultProduct>
+  return Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
            (permutation.derived(), matrix.derived());
 }

 namespace internal {

-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-  : traits<typename MatrixType::PlainObject>
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_matrix_product_retval
- : public ReturnByValue<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixType::StorageIndex StorageIndex;
-
-    permut_matrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const Index n = Side==OnTheLeft ? rows() : cols();
-      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
-      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
-      if(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix))
-      {
-        // apply the permutation inplace
-        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
-        mask.fill(false);
-        Index r = 0;
-        while(r < m_permutation.size())
-        {
-          // search for the next seed
-          while(r<m_permutation.size() && mask[r]) r++;
-          if(r>=m_permutation.size())
-            break;
-          // we got one, let's follow it until we are back to the seed
-          Index k0 = r++;
-          Index kPrev = k0;
-          mask.coeffRef(k0) = true;
-          for(Index k=m_permutation.indices().coeff(k0); k!=k0; k=m_permutation.indices().coeff(k))
-          {
-                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
-            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
-
-            mask.coeffRef(k) = true;
-            kPrev = k;
-          }
-        }
-      }
-      else
-      {
-        for(Index i = 0; i < n; ++i)
-        {
-          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-               (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
-
-          =
-
-          Block<const MatrixTypeNestedCleaned,Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixType::ColsAtCompileTime>
-               (m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i);
-        }
-      }
-    }
-
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
-};
-
 /* Template partial specialization for transposed/inverse permutations */

 template<typename Derived>
@@ -700,22 +620,22 @@ class Transpose<PermutationBase<Derived> >
    /** \returns the matrix with the inverse permutation applied to the columns.
      */
    template<typename OtherDerived> friend
-    const Product<OtherDerived, Transpose, DefaultProduct>
+    const Product<OtherDerived, Transpose, AliasFreeProduct>
    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
    {
-      return Product<OtherDerived, Transpose, DefaultProduct>(matrix.derived(), trPerm.derived());
+      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trPerm.derived());
    }

    /** \returns the matrix with the inverse permutation applied to the rows.
      */
    template<typename OtherDerived>
-    const Product<Transpose, OtherDerived, DefaultProduct>
+    const Product<Transpose, OtherDerived, AliasFreeProduct>
    operator*(const MatrixBase<OtherDerived>& matrix) const
    {
-      return Product<Transpose, OtherDerived, DefaultProduct>(*this, matrix.derived());
+      return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
    }

-    const PermutationType& nestedPermutation() const { return m_permutation; }
+    const PermutationType& nestedExpression() const { return m_permutation; }

  protected:
    const PermutationType& m_permutation;
@@ -728,32 +648,6 @@ const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() con
 }

 namespace internal {
-  
-// TODO currently a permutation matrix expression has the form PermutationMatrix or PermutationWrapper
-//      or their transpose; in the future shape should be defined by the expression traits
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct evaluator_traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
-{
-  typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
-  typedef PermutationShape Shape;
-  static const int AssumeAliasing = 0;
-};
-
-template<typename IndicesType>
-struct evaluator_traits<PermutationWrapper<IndicesType> >
-{
-  typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
-  typedef PermutationShape Shape;
-  static const int AssumeAliasing = 0;
-};
-
-template<typename Derived>
-struct evaluator_traits<Transpose<PermutationBase<Derived> > >
-{
-  typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
-  typedef PermutationShape Shape;
-  static const int AssumeAliasing = 0;
-};

 template<> struct AssignmentKind<DenseShape,PermutationShape> { typedef EigenBase2EigenBase Kind; };

--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -69,8 +69,9 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 namespace internal {

-// this is a warkaround to doxygen not being able to understand the inheritence logic
+// this is a workaround to doxygen not being able to understand the inheritance logic
 // when it is hidden by the dense_xpr_base helper struct.
+/** This class is just a workaround for Doxygen and it does not actually exist. */
 template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
 /** This class is just a workaround for Doxygen and it does not not actually exist. */
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
@@ -96,6 +97,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

    typedef typename internal::traits<Derived>::StorageKind StorageKind;
    typedef typename internal::traits<Derived>::Scalar Scalar;
+    
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
    typedef Derived DenseType;
@@ -114,20 +116,23 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    typedef Eigen::Map<Derived, Unaligned>  MapType;
    friend  class Eigen::Map<const Derived, Unaligned>;
    typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
-    friend  class Eigen::Map<Derived, Aligned>;
-    typedef Eigen::Map<Derived, Aligned> AlignedMapType;
-    friend  class Eigen::Map<const Derived, Aligned>;
-    typedef const Eigen::Map<const Derived, Aligned> ConstAlignedMapType;
+#if EIGEN_MAX_ALIGN_BYTES>0
+    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
+    friend  class Eigen::Map<Derived, AlignedMax>;
+    friend  class Eigen::Map<const Derived, AlignedMax>;
+#endif
+    typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
+    typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
    template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
    template<typename StrideType> struct StridedConstMapType { typedef Eigen::Map<const Derived, Unaligned, StrideType> type; };
-    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, Aligned, StrideType> type; };
-    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, Aligned, StrideType> type; };
+    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, AlignedMax, StrideType> type; };
+    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, AlignedMax, StrideType> type; };

  protected:
    DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;

  public:
-    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::EvaluatorFlags & AlignedBit) != 0 };
+    enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits<Derived>::Alignment>0) };
    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)

    EIGEN_DEVICE_FUNC
@@ -244,22 +249,22 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
      */
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
+    EIGEN_STRONG_INLINE void resize(Index rows, Index cols)
    {
-      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,nbCols==ColsAtCompileTime)
-                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,nbRows<=MaxRowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,nbCols<=MaxColsAtCompileTime)
-                   && nbRows>=0 && nbCols>=0 && "Invalid sizes when resizing a matrix or array.");
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
+      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,rows==RowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,cols==ColsAtCompileTime)
+                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,rows<=MaxRowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,cols<=MaxColsAtCompileTime)
+                   && rows>=0 && cols>=0 && "Invalid sizes when resizing a matrix or array.");
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
      #ifdef EIGEN_INITIALIZE_COEFFS
-        Index size = nbRows*nbCols;
+        Index size = rows*cols;
        bool size_changed = size != this->size();
-        m_storage.resize(size, nbRows, nbCols);
+        m_storage.resize(size, rows, cols);
        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
      #else
-        internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
-        m_storage.resize(nbRows*nbCols, nbRows, nbCols);
+        internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
+        m_storage.resize(rows*cols, rows, cols);
      #endif
    }

@@ -300,9 +305,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * \sa resize(Index,Index)
      */
    EIGEN_DEVICE_FUNC
-    inline void resize(NoChange_t, Index nbCols)
+    inline void resize(NoChange_t, Index cols)
    {
-      resize(rows(), nbCols);
+      resize(rows(), cols);
    }

    /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
@@ -314,9 +319,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * \sa resize(Index,Index)
      */
    EIGEN_DEVICE_FUNC
-    inline void resize(Index nbRows, NoChange_t)
+    inline void resize(Index rows, NoChange_t)
    {
-      resize(nbRows, cols());
+      resize(rows, cols());
    }

    /** Resizes \c *this to have the same dimensions as \a other.
@@ -356,9 +361,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * appended to the matrix they will be uninitialized.
      */
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, Index cols)
    {
-      internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
+      internal::conservative_resize_like_impl<Derived>::run(*this, rows, cols);
    }

    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -369,10 +374,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * In case the matrix is growing, new rows will be uninitialized.
      */
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, NoChange_t)
    {
      // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(nbRows, cols());
+      conservativeResize(rows, cols());
    }

    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -383,10 +388,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * In case the matrix is growing, new columns will be uninitialized.
      */
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
+    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index cols)
    {
      // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(rows(), nbCols);
+      conservativeResize(rows(), cols);
    }

    /** Resizes the vector to \a size while retaining old values.
@@ -479,9 +484,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }
 #endif

+    /** Copy constructor */
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
-      : m_storage(a_size, nbRows, nbCols)
+    EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
+      : Base(), m_storage(other.m_storage) { }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
+      : m_storage(size, rows, cols)
    {
 //       _check_template_params();
 //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@@ -498,15 +507,36 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      return this->derived();
    }

-    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
+      : m_storage()
+    {
+      _check_template_params();
+      resizeLike(other);
+      _set_noalias(other);
+    }
+
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC 
    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
-      : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
+      : m_storage()
    {
      _check_template_params();
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.derived().rows(), other.derived().cols());
-      Base::operator=(other.derived());
+      resizeLike(other);
+      *this = other.derived();
+    }
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue<OtherDerived>& other)
+    {
+      _check_template_params();
+      // FIXME this does not automatically transpose vectors if necessary
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
    }

    /** \name Map
@@ -668,12 +698,12 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

    template<typename T0, typename T1>
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
+    EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
    {
      EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
                          bool(NumTraits<T1>::IsInteger),
                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
-      resize(nbRows,nbCols);
+      resize(rows,cols);
    }
    
    template<typename T0, typename T1>
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -25,7 +25,7 @@ template<typename Lhs, typename Rhs, int Option, typename StorageKind> class Pro
  * This class represents an expression of the product of two arbitrary matrices.
  * 
  * The other template parameters are:
-  * \tparam Option     can be DefaultProduct or LazyProduct
+  * \tparam Option     can be DefaultProduct, AliasFreeProduct, or LazyProduct
  *
  */

@@ -53,6 +53,18 @@ template<typename Lhs, typename Rhs, typename LhsShape>
  typedef typename Lhs::Scalar Scalar;
 };

+template<typename Lhs, typename Rhs, typename RhsShape>
+struct product_result_scalar<Lhs, Rhs, TranspositionsShape, RhsShape>
+{
+  typedef typename Rhs::Scalar Scalar;
+};
+
+template<typename Lhs, typename Rhs, typename LhsShape>
+  struct product_result_scalar<Lhs, Rhs, LhsShape, TranspositionsShape>
+{
+  typedef typename Lhs::Scalar Scalar;
+};
+
 template<typename Lhs, typename Rhs, int Option>
 struct traits<Product<Lhs, Rhs, Option> >
 {
@@ -80,10 +92,11 @@ struct traits<Product<Lhs, Rhs, Option> >
    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
    
    // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
-    Flags = (   (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1)
-             || ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
-             || ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) )
-          ? RowMajorBit : (MaxColsAtCompileTime==1 ? 0 : NoPreferredStorageOrderBit)
+    Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit
+          : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+          : (   ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
+             || ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) ) ? RowMajorBit
+          : NoPreferredStorageOrderBit
  };
 };

@@ -108,8 +121,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
                                                        internal::product_type<Lhs,Rhs>::ret>::ret>::Base Base;
    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)

-    typedef typename internal::nested<Lhs>::type LhsNested;
-    typedef typename internal::nested<Rhs>::type RhsNested;
+    typedef typename internal::ref_selector<Lhs>::type LhsNested;
+    typedef typename internal::ref_selector<Rhs>::type RhsNested;
    typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
    typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;

@@ -152,7 +165,7 @@ public:
  
  operator const Scalar() const
  {
-    return typename internal::evaluator<ProductXpr>::type(derived()).coeff(0,0);
+    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
  }
 };

@@ -190,7 +203,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
      
-      return typename internal::evaluator<Derived>::type(derived()).coeff(row,col);
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
    }

    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
@@ -198,7 +211,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
      
-      return typename internal::evaluator<Derived>::type(derived()).coeff(i);
+      return internal::evaluator<Derived>(derived()).coeff(i);
    }
    
  
--- a/Eigen/src/Core/ProductBase.h
+++ b/Eigen/src/Core/ProductBase.h
@@ -1,27 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PRODUCTBASE_H
-#define EIGEN_PRODUCTBASE_H
-
-namespace Eigen { 
-
-/** \internal
-  * Overloaded to perform an efficient C = (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-{
-  other.derived().evalTo(derived());
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_PRODUCTBASE_H
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -32,9 +32,6 @@ struct evaluator<Product<Lhs, Rhs, Options> >
  typedef Product<Lhs, Rhs, Options> XprType;
  typedef product_evaluator<XprType> Base;
  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-  
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
@@ -47,9 +44,6 @@ struct evaluator<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Produ
  typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > XprType;
  typedef evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> > Base;
  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-  
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
    : Base(xpr.functor().m_other * xpr.nestedExpression().lhs() * xpr.nestedExpression().rhs())
  {}
@@ -63,9 +57,6 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
@@ -90,18 +81,25 @@ struct evaluator_traits<Product<Lhs, Rhs, DefaultProduct> >
  enum { AssumeAliasing = 1 };
 };

+template<typename Lhs, typename Rhs>
+struct evaluator_traits<Product<Lhs, Rhs, AliasFreeProduct> > 
+ : evaluator_traits_base<Product<Lhs, Rhs, AliasFreeProduct> >
+{
+  enum { AssumeAliasing = 0 };
+};
+
 // This is the default evaluator implementation for products:
 // It creates a temporary and call generic_product_impl
-template<typename Lhs, typename Rhs, int ProductTag, typename LhsShape, typename RhsShape>
-struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, LhsShape, RhsShape, typename traits<Lhs>::Scalar, typename traits<Rhs>::Scalar> 
-  : public evaluator<typename Product<Lhs, Rhs, DefaultProduct>::PlainObject>::type
+template<typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
+struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape, typename traits<Lhs>::Scalar, typename traits<Rhs>::Scalar,
+  EnableIf<(Options==DefaultProduct || Options==AliasFreeProduct)> >
+  : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject>
 {
-  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  typedef Product<Lhs, Rhs, Options> XprType;
  typedef typename XprType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
+  typedef evaluator<PlainObject> Base;
  enum {
    Flags = Base::Flags | EvalBeforeNestingBit
-//     CoeffReadCost = 0 // FIXME why is it needed? (this was already the case before the evaluators, see traits<ProductBase>)
  };

  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
@@ -109,7 +107,8 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, LhsShape
  {
    ::new (static_cast<Base*>(this)) Base(m_result);
    
-// FIXME shall we handle nested_eval here?
+// FIXME shall we handle nested_eval here?
+// If so, we must take care to remove the calls to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
 //     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
 //     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
 //     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
@@ -128,10 +127,11 @@ protected:
 };

 // Dense = Product
-template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
 {
-  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
  {
    // FIXME shall we handle nested_eval here?
@@ -140,10 +140,11 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_
 };

 // Dense += Product
-template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar>, Dense2Dense, Scalar>
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
 {
-  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
  {
    // FIXME shall we handle nested_eval here?
@@ -152,10 +153,11 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass
 };

 // Dense -= Product
-template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar>, Dense2Dense, Scalar>
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
 {
-  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
  {
    // FIXME shall we handle nested_eval here?
@@ -210,7 +212,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
 EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
-  typename evaluator<Rhs>::type rhsEval(rhs);
+  evaluator<Rhs> rhsEval(rhs);
  // FIXME make sure lhs is sequentially stored
  // FIXME not very good if rhs is real and lhs complex while alpha is real too
  // FIXME we should probably build an evaluator for dst
@@ -223,7 +225,7 @@ EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, cons
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
 EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
-  typename evaluator<Lhs>::type lhsEval(lhs);
+  evaluator<Lhs> lhsEval(lhs);
  // FIXME make sure rhs is sequentially stored
  // FIXME not very good if lhs is real and rhs complex while alpha is real too
  // FIXME we should probably build an evaluator for dst
@@ -395,8 +397,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;

-  typedef typename evaluator<LhsNestedCleaned>::type LhsEtorType;
-  typedef typename evaluator<RhsNestedCleaned>::type RhsEtorType;
+  typedef evaluator<LhsNestedCleaned> LhsEtorType;
+  typedef evaluator<RhsNestedCleaned> RhsEtorType;
  
  enum {
    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
@@ -409,7 +411,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    
    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
-    CoeffReadCost = (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic
+    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
+                  : (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic
                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

@@ -418,24 +421,22 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    LhsFlags = LhsEtorType::Flags,
    RhsFlags = RhsEtorType::Flags,
    
+    LhsAlignment = LhsEtorType::Alignment,
+    RhsAlignment = RhsEtorType::Alignment,
+    
+    LhsIsAligned = int(LhsAlignment) >= int(unpacket_traits<PacketScalar>::alignment),
+    RhsIsAligned = int(RhsAlignment) >= int(unpacket_traits<PacketScalar>::alignment),
+    
    LhsRowMajor = LhsFlags & RowMajorBit,
    RhsRowMajor = RhsFlags & RowMajorBit,
      
    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,

    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
-                    && (ColsAtCompileTime == Dynamic
-                        || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
-                            && (RhsFlags&AlignedBit)
-                            )
-                        ),
+                    && (ColsAtCompileTime == Dynamic || ( (ColsAtCompileTime % PacketSize) == 0 && RhsIsAligned ) ),

    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                    && (RowsAtCompileTime == Dynamic
-                        || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
-                            && (LhsFlags&AlignedBit)
-                            )
-                        ),
+                    && (RowsAtCompileTime == Dynamic || ( (RowsAtCompileTime % PacketSize) == 0 && LhsIsAligned ) ),

    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
@@ -443,11 +444,13 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,

    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
          | (EvalToRowMajor ? RowMajorBit : 0)
-          | (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
-          | (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
          // TODO enable vectorization for mixed types
          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
          
+    Alignment = CanVectorizeLhs ? LhsAlignment
+              : CanVectorizeRhs ? RhsAlignment
+              : 0,
+          
    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
    * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
    * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
@@ -457,7 +460,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                        && LhsRowMajor
                        && (!RhsRowMajor)
                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                        && (LhsFlags & RhsFlags & AlignedBit)
+                        && (LhsIsAligned && RhsIsAligned)
                        && (InnerSize % packet_traits<Scalar>::size == 0)
  };
  
@@ -479,13 +482,13 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
  }

-  template<int LoadMode>
-  const PacketReturnType packet(Index row, Index col) const
+  template<int LoadMode, typename PacketType>
+  const PacketType packet(Index row, Index col) const
  {
-    PacketScalar res;
+    PacketType res;
    typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
-                                     Unroll ? InnerSize-1 : Dynamic,
-                                     LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl;
+                                     Unroll ? InnerSize : Dynamic,
+                                     LhsEtorType, RhsEtorType, PacketType, LoadMode> PacketImpl;

    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
    return res;
@@ -527,7 +530,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
  {
    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
  }
 };

@@ -537,25 +540,43 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
  {
    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
  }
 };

 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
  {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+    res = pset1<Packet>(0);
  }
 };

 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
  {
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pset1<Packet>(0);
  }
 };

@@ -564,10 +585,9 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
  {
-    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
-    for(Index i = 1; i < innerDim; ++i)
-      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
+    res = pset1<Packet>(0);
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
  }
 };

@@ -576,10 +596,9 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
  {
-    eigen_assert(innerDim>0 && "you are using a non initialized matrix");
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
-    for(Index i = 1; i < innerDim; ++i)
-      res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+    res = pset1<Packet>(0);
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
  }
 };

@@ -663,7 +682,6 @@ struct diagonal_product_evaluator_base
  : evaluator_base<Derived>
 {
   typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
-   typedef typename internal::packet_traits<Scalar>::type PacketScalar;
 public:
  enum {
    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
@@ -678,8 +696,8 @@ public:
    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
-    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit
-            //(int(MatrixFlags)&int(DiagFlags)&AlignedBit),
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
+    Alignment = evaluator<MatrixType>::Alignment
  };
  
  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -693,26 +711,26 @@ public:
  }
  
 protected:
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
  {
-    return internal::pmul(m_matImpl.template packet<LoadMode>(row, col),
-                          internal::pset1<PacketScalar>(m_diagImpl.coeff(id)));
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
  }
  
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
  {
    enum {
      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-      DiagonalPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagFlags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
+      DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
    };
-    return internal::pmul(m_matImpl.template packet<LoadMode>(row, col),
-                          m_diagImpl.template packet<DiagonalPacketLoadMode>(id));
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
  }
  
-  typename evaluator<DiagonalType>::nestedType m_diagImpl;
-  typename evaluator<MatrixType>::nestedType   m_matImpl;
+  evaluator<DiagonalType> m_diagImpl;
+  evaluator<MatrixType>   m_matImpl;
 };

 // diagonal * dense
@@ -724,9 +742,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
  using Base::m_diagImpl;
  using Base::m_matImpl;
  using Base::coeff;
-  using Base::packet_impl;
  typedef typename Base::Scalar Scalar;
-  typedef typename Base::PacketScalar PacketScalar;
  
  typedef Product<Lhs, Rhs, ProductKind> XprType;
  typedef typename XprType::PlainObject PlainObject;
@@ -746,18 +762,19 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
  }
  
 #ifndef __CUDACC__
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
  {
-    // NVCC complains about template keyword, so we disable this function in CUDA mode
-    return this->template packet_impl<LoadMode>(row,col, row,
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_impl<LoadMode,PacketType>(row,col, row,
                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
  }
  
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
  {
-    return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
  }
 #endif
 };
@@ -771,9 +788,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
  using Base::m_diagImpl;
  using Base::m_matImpl;
  using Base::coeff;
-  using Base::packet_impl;
  typedef typename Base::Scalar Scalar;
-  typedef typename Base::PacketScalar PacketScalar;
  
  typedef Product<Lhs, Rhs, ProductKind> XprType;
  typedef typename XprType::PlainObject PlainObject;
@@ -791,17 +806,17 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
  }
  
 #ifndef __CUDACC__
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
  {
-    return this->template packet_impl<LoadMode>(row,col, col,
+    return this->template packet_impl<LoadMode,PacketType>(row,col, col,
                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
  }
  
-  template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
  {
-    return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
  }
 #endif
 };
@@ -809,48 +824,187 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
 /***************************************************************************
 * Products with permutation matrices
 ***************************************************************************/
-  
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs, Rhs, PermutationShape, DenseShape, ProductTag>
+
+/** \internal
+  * \class permutation_matrix_product
+  * Internal helper class implementing the product between a permutation matrix and a matrix.
+  * This class is specialized for DenseShape below and for SparseShape in SparseCore/SparsePermutation.h
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct permutation_matrix_product;
+
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
+{
+    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+
+    template<typename Dest, typename PermutationType>
+    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+    {
+      MatrixType mat(xpr);
+      const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
+      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
+      //if(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))
+      if(is_same_dense(dst, mat))
+      {
+        // apply the permutation inplace
+        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(perm.size());
+        mask.fill(false);
+        Index r = 0;
+        while(r < perm.size())
+        {
+          // search for the next seed
+          while(r<perm.size() && mask[r]) r++;
+          if(r>=perm.size())
+            break;
+          // we got one, let's follow it until we are back to the seed
+          Index k0 = r++;
+          Index kPrev = k0;
+          mask.coeffRef(k0) = true;
+          for(Index k=perm.indices().coeff(k0); k!=k0; k=perm.indices().coeff(k))
+          {
+                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
+            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
+
+            mask.coeffRef(k) = true;
+            kPrev = k;
+          }
+        }
+      }
+      else
+      {
+        for(Index i = 0; i < n; ++i)
+        {
+          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+               (dst, ((Side==OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i)
+
+          =
+
+          Block<const MatrixTypeCleaned,Side==OnTheLeft ? 1 : MatrixTypeCleaned::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixTypeCleaned::ColsAtCompileTime>
+               (mat, ((Side==OnTheRight) ^ Transposed) ? perm.indices().coeff(i) : i);
+        }
+      }
+    }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
-    permut_matrix_product_retval<Lhs, Rhs, OnTheLeft, false> pmpr(lhs, rhs);
-    pmpr.evalTo(dst);
+    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
  }
 };

-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs, Rhs, DenseShape, PermutationShape, ProductTag>
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
  {
-    permut_matrix_product_retval<Rhs, Lhs, OnTheRight, false> pmpr(rhs, lhs);
-    pmpr.evalTo(dst);
+    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
  }
 };

-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Transpose<Lhs>, Rhs, PermutationShape, DenseShape, ProductTag>
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Transpose<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
  template<typename Dest>
  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
  {
-    permut_matrix_product_retval<Lhs, Rhs, OnTheLeft, true> pmpr(lhs.nestedPermutation(), rhs);
-    pmpr.evalTo(dst);
+    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
  }
 };

-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs, Transpose<Rhs>, DenseShape, PermutationShape, ProductTag>
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, PermutationShape, ProductTag>
 {
  template<typename Dest>
  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
  {
-    permut_matrix_product_retval<Rhs, Lhs, OnTheRight, true> pmpr(rhs.nestedPermutation(), lhs);
-    pmpr.evalTo(dst);
+    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+
+/***************************************************************************
+* Products with transpositions matrices
+***************************************************************************/
+
+// FIXME could we unify Transpositions and Permutation into a single "shape"??
+
+/** \internal
+  * \class transposition_matrix_product
+  * Internal helper class implementing the product between a sequence of transpositions and a matrix.
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct transposition_matrix_product
+{
+  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+  
+  template<typename Dest, typename TranspositionType>
+  static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
+  {
+    MatrixType mat(xpr);
+    typedef typename TranspositionType::StorageIndex StorageIndex;
+    const Index size = tr.size();
+    StorageIndex j = 0;
+
+    if(!(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat)))
+      dst = mat;
+
+    for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
+      if(Index(j=tr.coeff(k))!=k)
+      {
+        if(Side==OnTheLeft)        dst.row(k).swap(dst.row(j));
+        else if(Side==OnTheRight)  dst.col(k).swap(dst.col(j));
+      }
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  {
+    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
  }
 };

--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -53,7 +53,7 @@ struct functor_traits<scalar_random_op<Scalar> >
  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
  */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index rows, Index cols)
 {
  return NullaryExpr(rows, cols, internal::scalar_random_op<Scalar>());
@@ -84,7 +84,7 @@ DenseBase<Derived>::Random(Index rows, Index cols)
  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
  */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index size)
 {
  return NullaryExpr(size, internal::scalar_random_op<Scalar>());
@@ -110,7 +110,7 @@ DenseBase<Derived>::Random(Index size)
  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
  */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random()
 {
  return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op<Scalar>());
@@ -162,8 +162,8 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
  *
  * \not_reentrant
  * 
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
  *
  * Example: \include Matrix_setRandom_int_int.cpp
  * Output: \verbinclude Matrix_setRandom_int_int.out
@@ -172,9 +172,9 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
  */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setRandom(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setRandom(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
  return setRandom();
 }

--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -165,7 +165,7 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
    index = Start * packet_traits<typename Derived::Scalar>::size,
    outer = index / int(Derived::InnerSizeAtCompileTime),
    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
+    alignment = Derived::Alignment
  };

  typedef typename Derived::Scalar Scalar;
@@ -173,7 +173,7 @@ struct redux_vec_unroller<Func, Derived, Start, 1>

  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
  {
-    return mat.template packetByOuterInner<alignment>(outer, inner);
+    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
  }
 };

@@ -222,11 +222,12 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
    const Index size = mat.size();
    
    const Index packetSize = packet_traits<Scalar>::size;
-    const Index alignedStart = internal::first_aligned(mat);
+    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
    enum {
-      alignment = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) || bool(Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
+      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
    };
+    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
    const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -234,19 +235,19 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
    Scalar res;
    if(alignedSize)
    {
-      PacketScalar packet_res0 = mat.template packet<alignment>(alignedStart);
+      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
      if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
      {
-        PacketScalar packet_res1 = mat.template packet<alignment>(alignedStart+packetSize);
+        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
        for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
        {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
        }

        packet_res0 = func.packetOp(packet_res0,packet_res1);
        if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
      }
      res = func.predux(packet_res0);

@@ -272,7 +273,7 @@ template<typename Func, typename Derived>
 struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
 {
  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+  typedef typename packet_traits<Scalar>::type PacketType;

  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
  {
@@ -286,10 +287,10 @@ struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
    Scalar res;
    if(packetedInnerSize)
    {
-      PacketScalar packet_res = mat.template packet<Unaligned>(0,0);
+      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned>(j,i));
+          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));

      res = func.predux(packet_res);
      for(Index j=0; j<outerSize; ++j)
@@ -352,7 +353,8 @@ public:
    IsRowMajor = XprType::IsRowMajor,
    SizeAtCompileTime = XprType::SizeAtCompileTime,
    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost
+    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
+    Alignment = evaluator<XprType>::Alignment
  };
  
  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
@@ -369,24 +371,26 @@ public:
  CoeffReturnType coeff(Index index) const
  { return m_evaluator.coeff(index); }

-  template<int LoadMode>
+  template<int LoadMode, typename PacketType>
  PacketReturnType packet(Index row, Index col) const
-  { return m_evaluator.template packet<LoadMode>(row, col); }
+  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }

-  template<int LoadMode>
+  template<int LoadMode, typename PacketType>
  PacketReturnType packet(Index index) const
-  { return m_evaluator.template packet<LoadMode>(index); }
+  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
  
  EIGEN_DEVICE_FUNC
  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
  
-  template<int LoadMode>
+  template<int LoadMode, typename PacketType>
  PacketReturnType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+  const XprType & nestedExpression() const { return m_xpr; }
  
 protected:
-  typename internal::evaluator<XprType>::nestedType m_evaluator;
+  internal::evaluator<XprType> m_evaluator;
  const XprType &m_xpr;
 };

@@ -406,7 +410,7 @@ protected:
  */
 template<typename Derived>
 template<typename Func>
-EIGEN_STRONG_INLINE typename internal::result_of<Func(typename internal::traits<Derived>::Scalar)>::type
+typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -18,7 +18,7 @@ namespace Eigen {
  * \brief A matrix or vector expression mapping an existing expression
  *
  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
  *                The default is \c #Unaligned.
  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
  *                   but accepts a variable outer stride (leading dimension).
@@ -48,8 +48,9 @@ namespace Eigen {
  * VectorXf a;
  * foo1(a.head());             // OK
  * foo1(A.col());              // OK
-  * foo1(A.row());              // compilation error because here innerstride!=1
-  * foo2(A.row());              // The row is copied into a contiguous temporary
+  * foo1(A.row());              // Compilation error because here innerstride!=1
+  * foo2(A.row());              // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object
+  * foo2(A.row().transpose());  // The row is copied into a contiguous temporary
  * foo2(2*a);                  // The expression is evaluated into a temporary
  * foo2(A.col().segment(2,4)); // No temporary
  * \endcode
@@ -91,7 +92,8 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
  typedef _StrideType StrideType;
  enum {
    Options = _Options,
-    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit,
+    Alignment = traits<Map<_PlainObjectType, _Options, _StrideType> >::Alignment
  };

  template<typename Derived> struct match {
@@ -103,8 +105,9 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
      OuterStrideMatch = Derived::IsVectorAtCompileTime
                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
-      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (int(evaluator<Derived>::Alignment) >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
+      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
    };
    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
  };
@@ -183,9 +186,11 @@ protected:
 template<typename PlainObjectType, int Options, typename StrideType> class Ref
  : public RefBase<Ref<PlainObjectType, Options, StrideType> >
 {
+  private:
    typedef internal::traits<Ref> Traits;
    template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr);
+    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
  public:

    typedef RefBase<Ref> Base;
@@ -194,13 +199,15 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref

    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
    {
      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
      Base::construct(expr.derived());
    }
    template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
    #else
    template<typename Derived>
    inline Ref(DenseBase<Derived>& expr)
@@ -227,7 +234,8 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
    EIGEN_DENSE_PUBLIC_INTERFACE(Ref)

    template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
    {
 //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
 //      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -35,10 +35,7 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
  typedef typename MatrixType::Scalar Scalar;
  typedef typename traits<MatrixType>::StorageKind StorageKind;
  typedef typename traits<MatrixType>::XprKind XprKind;
-  enum {
-    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
-  };
-  typedef typename nested<MatrixType,Factor>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
  enum {
    RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic
@@ -72,8 +69,9 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
    typedef typename internal::remove_all<MatrixType>::type NestedExpression;

    template<typename OriginalMatrixType>
-    inline explicit Replicate(const OriginalMatrixType& a_matrix)
-      : m_matrix(a_matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
+    EIGEN_DEVICE_FUNC
+    inline explicit Replicate(const OriginalMatrixType& matrix)
+      : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
    {
      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
@@ -81,41 +79,20 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
    }

    template<typename OriginalMatrixType>
-    inline Replicate(const OriginalMatrixType& a_matrix, Index rowFactor, Index colFactor)
-      : m_matrix(a_matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
+    EIGEN_DEVICE_FUNC
+    inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
+      : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
    {
      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
    }

+    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+    EIGEN_DEVICE_FUNC
    inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }

-    inline Scalar coeff(Index rowId, Index colId) const
-    {
-      // try to avoid using modulo; this is a pure optimization strategy
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.coeff(actual_row, actual_col);
-    }
-    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
-    {
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.template packet<LoadMode>(actual_row, actual_col);
-    }
-
+    EIGEN_DEVICE_FUNC
    const _MatrixTypeNested& nestedExpression() const
    { 
      return m_matrix; 
@@ -137,27 +114,12 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
  */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-inline const Replicate<Derived,RowFactor,ColFactor>
+const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
  return Replicate<Derived,RowFactor,ColFactor>(derived());
 }

-/**
-  * \return an expression of the replication of \c *this
-  *
-  * Example: \include MatrixBase_replicate_int_int.cpp
-  * Output: \verbinclude MatrixBase_replicate_int_int.out
-  *
-  * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
-  */
-template<typename Derived>
-inline const Replicate<Derived,Dynamic,Dynamic>
-DenseBase<Derived>::replicate(Index rowFactor,Index colFactor) const
-{
-  return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);
-}
-
 /**
  * \return an expression of the replication of each column (or row) of \c *this
  *
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -94,15 +94,12 @@ namespace internal {
  
 template<typename Derived>
 struct evaluator<ReturnByValue<Derived> >
-  : public evaluator<typename internal::traits<Derived>::ReturnType>::type
+  : public evaluator<typename internal::traits<Derived>::ReturnType>
 {
  typedef ReturnByValue<Derived> XprType;
  typedef typename internal::traits<Derived>::ReturnType PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
+  typedef evaluator<PlainObject> Base;
  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
    : m_result(xpr.rows(), xpr.cols())
  {
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -37,7 +37,7 @@ struct traits<Reverse<MatrixType, Direction> >
  typedef typename MatrixType::Scalar Scalar;
  typedef typename traits<MatrixType>::StorageKind StorageKind;
  typedef typename traits<MatrixType>::XprKind XprKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
  enum {
    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
@@ -48,14 +48,14 @@ struct traits<Reverse<MatrixType, Direction> >
  };
 };

-template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond
 {
-  static inline PacketScalar run(const PacketScalar& x) { return preverse(x); }
+  static inline PacketType run(const PacketType& x) { return preverse(x); }
 };

-template<typename PacketScalar> struct reverse_packet_cond<PacketScalar,false>
+template<typename PacketType> struct reverse_packet_cond<PacketType,false>
 {
-  static inline PacketScalar run(const PacketScalar& x) { return x; }
+  static inline PacketType run(const PacketType& x) { return x; }
 };

 } // end namespace internal 
@@ -70,10 +70,6 @@ template<typename MatrixType, int Direction> class Reverse
    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
    using Base::IsRowMajor;

-    // next line is necessary because otherwise const version of operator()
-    // is hidden by non-const version defined in this file
-    using Base::operator(); 
-
  protected:
    enum {
      PacketSize = internal::packet_traits<Scalar>::size,
@@ -101,69 +97,6 @@ template<typename MatrixType, int Direction> class Reverse
      return -m_matrix.innerStride();
    }

-    EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col)
-    {
-      eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
-      return coeffRef(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                            ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(m_matrix.size() - index - 1);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& operator()(Index index)
-    {
-      eigen_assert(index >= 0 && index < m_matrix.size());
-      return coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return reverse_packet::run(m_matrix.template packet<LoadMode>(
-                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(
-                                      ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                      ReverseCol ? m_matrix.cols() - col - OffsetCol : col,
-                                      reverse_packet::run(x));
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return internal::preverse(m_matrix.template packet<LoadMode>( m_matrix.size() - index - PacketSize ));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
-    }
-
    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
    nestedExpression() const 
    {
@@ -187,30 +120,90 @@ DenseBase<Derived>::reverse()
  return ReverseReturnType(derived());
 }

-/** This is the const version of reverse(). */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstReverseReturnType
-DenseBase<Derived>::reverse() const
-{
-  return ConstReverseReturnType(derived());
-}
+
+// reverse const overload moved to DenseBase.h due to a CUDA compiler bug

 /** This is the "in place" version of reverse: it reverses \c *this.
  *
  * In most cases it is probably better to simply use the reversed expression
  * of a matrix. However, when reversing the matrix data itself is really needed,
  * then this "in-place" version is probably the right choice because it provides
-  * the following additional features:
+  * the following additional benefits:
  *  - less error prone: doing the same operation with .reverse() requires special care:
  *    \code m = m.reverse().eval(); \endcode
-  *  - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap)
+  *  - this API enables reverse operations without the need for a temporary
  *  - it allows future optimizations (cache friendliness, etc.)
  *
-  * \sa reverse() */
+  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
 inline void DenseBase<Derived>::reverseInPlace()
 {
-  derived() = derived().reverse().eval();
+  if(cols()>rows())
+  {
+    Index half = cols()/2;
+    leftCols(half).swap(rightCols(half).reverse());
+    if((cols()%2)==1)
+    {
+      Index half2 = rows()/2;
+      col(half).head(half2).swap(col(half).tail(half2).reverse());
+    }
+  }
+  else
+  {
+    Index half = rows()/2;
+    topRows(half).swap(bottomRows(half).reverse());
+    if((rows()%2)==1)
+    {
+      Index half2 = cols()/2;
+      row(half).head(half2).swap(row(half).tail(half2).reverse());
+    }
+  }
+}
+
+namespace internal {
+  
+template<int Direction>
+struct vectorwise_reverse_inplace_impl;
+
+template<>
+struct vectorwise_reverse_inplace_impl<Vertical>
+{
+  template<typename ExpressionType>
+  static void run(ExpressionType &xpr)
+  {
+    Index half = xpr.rows()/2;
+    xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
+  }
+};
+
+template<>
+struct vectorwise_reverse_inplace_impl<Horizontal>
+{
+  template<typename ExpressionType>
+  static void run(ExpressionType &xpr)
+  {
+    Index half = xpr.cols()/2;
+    xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
+  }
+};
+
+} // end namespace internal
+
+/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this.
+  *
+  * In most cases it is probably better to simply use the reversed expression
+  * of a matrix. However, when reversing the matrix data itself is really needed,
+  * then this "in-place" version is probably the right choice because it provides
+  * the following additional benefits:
+  *  - less error prone: doing the same operation with .colwise().reverse() or .rowwise().reverse() requires special care:
+  *    \code m = m.colwise().reverse().eval(); \endcode
+  *  - this API enables reverse operations without the need for a temporary
+  *
+  * \sa DenseBase::reverseInPlace(), reverse() */
+template<typename ExpressionType, int Direction>
+void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+{
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
 }

 } // end namespace Eigen
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -32,7 +32,7 @@ namespace internal {
 template<typename MatrixType, unsigned int UpLo>
 struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
  typedef MatrixType ExpressionType;
  typedef typename MatrixType::PlainObject FullMatrixType;
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -113,15 +113,12 @@ namespace internal {
 // Evaluator of Solve -> eval into a temporary
 template<typename Decomposition, typename RhsType>
 struct evaluator<Solve<Decomposition,RhsType> >
-  : public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>::type
+  : public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>
 {
  typedef Solve<Decomposition,RhsType> SolveType;
  typedef typename SolveType::PlainObject PlainObject;
-  typedef typename evaluator<PlainObject>::type Base;
+  typedef evaluator<PlainObject> Base;
  
-  typedef evaluator type;
-  typedef evaluator nestedType;
-
  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
    : m_result(solve.rows(), solve.cols())
  {
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -198,8 +198,8 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
  * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
  * is an upper (resp. lower) triangular matrix.
  *
-  * Example: \include MatrixBase_marked.cpp
-  * Output: \verbinclude MatrixBase_marked.out
+  * Example: \include Triangular_solve.cpp
+  * Output: \verbinclude Triangular_solve.out
  *
  * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
  * to the same matrix or vector \a other.
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -157,19 +157,32 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
  using std::sqrt;
+  using std::abs;
  const Index blockSize = 4096;
  RealScalar scale(0);
  RealScalar invScale(1);
  RealScalar ssq(0); // sum of square
+  
+  typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
+  typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
+  DerivedCopy copy(derived());
+  
  enum {
-    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0
+    CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME
  };
+  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
+                                                   typename DerivedCopyClean
+                                                   ::ConstSegmentReturnType>::type SegmentWrapper;
  Index n = size();
-  Index bi = internal::first_aligned(derived());
+  
+  if(n==1)
+    return abs(this->coeff(0));
+  
+  Index bi = internal::first_default_aligned(copy);
  if (bi>0)
-    internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
+    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
  for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(this->segment(bi,numext::mini(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
+    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
  return scale * sqrt(ssq);
 }

--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -21,7 +21,6 @@ class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap
 {
 protected:
  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> Base;
-  typedef typename DstEvaluatorTypeT::PacketScalar PacketScalar;
  using Base::m_dst;
  using Base::m_src;
  using Base::m_functor;
@@ -35,25 +34,29 @@ public:
    : Base(dst, src, func, dstExpr)
  {}
  
-  template<int StoreMode, int LoadMode>
+  template<int StoreMode, int LoadMode, typename PacketType>
  void assignPacket(Index row, Index col)
  {
-    m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
+    m_dst.template writePacket<StoreMode>(row,col,tmp);
  }
  
-  template<int StoreMode, int LoadMode>
+  template<int StoreMode, int LoadMode, typename PacketType>
  void assignPacket(Index index)
  {
-    m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
+    m_dst.template writePacket<StoreMode>(index,tmp);
  }
  
  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
-  template<int StoreMode, int LoadMode>
+  template<int StoreMode, int LoadMode, typename PacketType>
  void assignPacketByOuterInner(Index outer, Index inner)
  {
    Index row = Base::rowIndexByOuterInner(outer, inner); 
    Index col = Base::colIndexByOuterInner(outer, inner);
-    assignPacket<StoreMode,LoadMode>(row, col);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
  }
 };

--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -31,7 +31,7 @@ namespace internal {
 template<typename MatrixType>
 struct traits<Transpose<MatrixType> > : public traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
  enum {
    RowsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -60,7 +60,7 @@ template<typename MatrixType> class Transpose
    typedef typename internal::remove_all<MatrixType>::type NestedExpression;

    EIGEN_DEVICE_FUNC
-    explicit inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
+    explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}

    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)

@@ -233,7 +233,7 @@ struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x Packet
    typedef typename MatrixType::Scalar Scalar;
    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
    const Index PacketSize = internal::packet_traits<Scalar>::size;
-    const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned;
+    const Index Alignment = internal::evaluator<MatrixType>::Alignment;
    PacketBlock<Packet> A;
    for (Index i=0; i<PacketSize; ++i)
      A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
@@ -317,14 +317,6 @@ inline void MatrixBase<Derived>::adjointInPlace()

 namespace internal {

-template<typename BinOp,typename NestedXpr,typename Rhs>
-struct blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> >
- : blas_traits<NestedXpr>
-{
-  typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType;
-  static inline const XprType extract(const XprType& x) { return x; }
-};
-
 template<bool DestIsTransposed, typename OtherDerived>
 struct check_transpose_aliasing_compile_time_selector
 {
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -41,10 +41,6 @@ namespace Eigen {
  * \sa class PermutationMatrix
  */

-namespace internal {
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed=false> struct transposition_matrix_product_retval;
-}
-
 template<typename Derived>
 class TranspositionsBase
 {
@@ -66,7 +62,7 @@ class TranspositionsBase
      indices() = other.indices();
      return derived();
    }
-
+    
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** This is a special case of the templated operator=. Its purpose is to
      * prevent a default operator= from hiding the templated operator=.
@@ -79,7 +75,11 @@ class TranspositionsBase
    #endif

    /** \returns the number of transpositions */
-    inline Index size() const { return indices().size(); }
+    Index size() const { return indices().size(); }
+    /** \returns the number of rows of the equivalent permutation matrix */
+    Index rows() const { return indices().size(); }
+    /** \returns the number of columns of the equivalent permutation matrix */
+    Index cols() const { return indices().size(); }

    /** Direct access to the underlying index vector */
    inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
@@ -147,9 +147,10 @@ class TranspositionsBase
 namespace internal {
 template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
 struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
-  typedef _StorageIndex StorageIndex;
+  typedef TranspositionsStorage StorageKind;
 };
 }

@@ -178,7 +179,7 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim

    /** Generic constructor from expression of the transposition indices. */
    template<typename Other>
-    explicit inline Transpositions(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
    {}

    /** Copies the \a other transpositions into \c *this */
@@ -218,9 +219,11 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
 namespace internal {
 template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
 struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,_PacketAccess> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
  typedef Map<const Matrix<_StorageIndex,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
  typedef _StorageIndex StorageIndex;
+  typedef TranspositionsStorage StorageKind;
 };
 }

@@ -275,9 +278,9 @@ class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,P
 namespace internal {
 template<typename _IndicesType>
 struct traits<TranspositionsWrapper<_IndicesType> >
+ : traits<PermutationWrapper<_IndicesType> >
 {
-  typedef typename _IndicesType::Scalar StorageIndex;
-  typedef _IndicesType IndicesType;
+  typedef TranspositionsStorage StorageKind;
 };
 }

@@ -292,8 +295,8 @@ class TranspositionsWrapper
    typedef typename Traits::IndicesType IndicesType;
    typedef typename IndicesType::Scalar StorageIndex;

-    explicit inline TranspositionsWrapper(IndicesType& a_indices)
-      : m_indices(a_indices)
+    explicit inline TranspositionsWrapper(IndicesType& indices)
+      : m_indices(indices)
    {}

    /** Copies the \a other transpositions into \c *this */
@@ -325,80 +328,43 @@ class TranspositionsWrapper
    const typename IndicesType::Nested m_indices;
 };

+
+
 /** \returns the \a matrix with the \a transpositions applied to the columns.
  */
-template<typename Derived, typename TranspositionsDerived>
-inline const internal::transposition_matrix_product_retval<TranspositionsDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const TranspositionsBase<TranspositionsDerived> &transpositions)
+template<typename MatrixDerived, typename TranspositionsDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const TranspositionsBase<TranspositionsDerived>& transpositions)
 {
-  return internal::transposition_matrix_product_retval
-           <TranspositionsDerived, Derived, OnTheRight>
-           (transpositions.derived(), matrix.derived());
+  return Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+            (matrix.derived(), transpositions.derived());
 }

 /** \returns the \a matrix with the \a transpositions applied to the rows.
  */
-template<typename Derived, typename TranspositionDerived>
-inline const internal::transposition_matrix_product_retval
-               <TranspositionDerived, Derived, OnTheLeft>
-operator*(const TranspositionsBase<TranspositionDerived> &transpositions,
-          const MatrixBase<Derived>& matrix)
+template<typename TranspositionsDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+operator*(const TranspositionsBase<TranspositionsDerived> &transpositions,
+          const MatrixBase<MatrixDerived>& matrix)
 {
-  return internal::transposition_matrix_product_retval
-           <TranspositionDerived, Derived, OnTheLeft>
-           (transpositions.derived(), matrix.derived());
+  return Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+            (transpositions.derived(), matrix.derived());
 }

+// Template partial specialization for transposed/inverse transpositions
+
 namespace internal {

-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct traits<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct transposition_matrix_product_retval
- : public ReturnByValue<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename TranspositionType::StorageIndex StorageIndex;
-
-    transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
-      : m_transpositions(tr), m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const Index size = m_transpositions.size();
-      StorageIndex j = 0;
-
-      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
-        dst = m_matrix;
-
-      for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
-        if(Index(j=m_transpositions.coeff(k))!=k)
-        {
-          if(Side==OnTheLeft)
-            dst.row(k).swap(dst.row(j));
-          else if(Side==OnTheRight)
-            dst.col(k).swap(dst.col(j));
-        }
-    }
-
-  protected:
-    const TranspositionType& m_transpositions;
-    typename MatrixType::Nested m_matrix;
-};
+template<typename Derived>
+struct traits<Transpose<TranspositionsBase<Derived> > >
+ : traits<Derived>
+{};

 } // end namespace internal

-/* Template partial specialization for transposed/inverse transpositions */
-
 template<typename TranspositionsDerived>
 class Transpose<TranspositionsBase<TranspositionsDerived> >
 {
@@ -408,25 +374,29 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >

    explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}

-    inline int size() const { return m_transpositions.size(); }
+    Index size() const { return m_transpositions.size(); }
+    Index rows() const { return m_transpositions.size(); }
+    Index cols() const { return m_transpositions.size(); }

    /** \returns the \a matrix with the inverse transpositions applied to the columns.
      */
-    template<typename Derived> friend
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>
-    operator*(const MatrixBase<Derived>& matrix, const Transpose& trt)
+    template<typename OtherDerived> friend
+    const Product<OtherDerived, Transpose, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
     {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>(trt.m_transpositions, matrix.derived());
+      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt); // pass the Transpose operand itself, as the member overload does with *this
     }

    /** \returns the \a matrix with the inverse transpositions applied to the rows.
      */
-    template<typename Derived>
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>
-    operator*(const MatrixBase<Derived>& matrix) const
+    template<typename OtherDerived>
+    const Product<Transpose, OtherDerived, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix) const
    {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>(m_transpositions, matrix.derived());
+      return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
    }
+    
+    const TranspositionType& nestedExpression() const { return m_transpositions; }

  protected:
    const TranspositionType& m_transpositions;
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -19,9 +19,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
  
 }

-/** \internal
-  *
-  * \class TriangularBase
+/** \class TriangularBase
  * \ingroup Core_Module
  *
  * \brief Base class for triangular part in a matrix
@@ -38,10 +36,14 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
      
      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
-                                                   internal::traits<Derived>::ColsAtCompileTime>::ret)
-        /**< This is equal to the number of coefficients, i.e. the number of
+                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
+      /**< This is equal to the number of coefficients, i.e. the number of
          * rows times the number of columns, or to \a Dynamic if this is not
          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+      
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                   internal::traits<Derived>::MaxColsAtCompileTime>::ret)
+        
    };
    typedef typename internal::traits<Derived>::Scalar Scalar;
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
@@ -63,11 +65,11 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
    inline Index innerStride() const { return derived().innerStride(); }
    
    // dummy resize function
-    void resize(Index nbRows, Index nbCols)
+    void resize(Index rows, Index cols)
    {
-      EIGEN_UNUSED_VARIABLE(nbRows);
-      EIGEN_UNUSED_VARIABLE(nbCols);
-      eigen_assert(nbRows==rows() && nbCols==nbCols);
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+      eigen_assert(rows==this->rows() && cols==this->cols());
    }

    EIGEN_DEVICE_FUNC
@@ -148,17 +150,17 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
 /** \class TriangularView
  * \ingroup Core_Module
  *
-  * \brief Base class for triangular part in a matrix
+  * \brief Expression of a triangular part in a matrix
  *
  * \param MatrixType the type of the object in which we are taking the triangular part
  * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
  *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
  *             This is in fact a bit field; it must have either #Upper or #Lower, 
-  *             and additionnaly it may have #UnitDiag or #ZeroDiag or neither.
+  *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
  *
  * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
  * matrices one should speak of "trapezoid" parts. This class is the return type
-  * of MatrixBase::triangularView() and most of the time this is the only way it is used.
+  * of MatrixBase::triangularView() and SparseMatrixBase::triangularView(), and most of the time this is the only way it is used.
  *
  * \sa MatrixBase::triangularView()
  */
@@ -166,7 +168,7 @@ namespace internal {
 template<typename MatrixType, unsigned int _Mode>
 struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
  typedef typename MatrixType::PlainObject FullMatrixType;
@@ -306,6 +308,15 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
    MatrixTypeNested m_matrix;
 };

+/** \ingroup Core_Module
+  *
+  * \brief Base class for a triangular part in a \b dense matrix
+  *
+  * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+  * It extends class TriangularView with additional methods which are available for dense expressions only.
+  *
+  * \sa class TriangularView, MatrixBase::triangularView()
+  */
 template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_MatrixType,_Mode,Dense>
  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
 {
@@ -549,8 +560,8 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
  * The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
  * \c #Lower, \c #StrictlyLower, \c #UnitLower.
  *
-  * Example: \include MatrixBase_extract.cpp
-  * Output: \verbinclude MatrixBase_extract.out
+  * Example: \include MatrixBase_triangularView.cpp
+  * Output: \verbinclude MatrixBase_triangularView.out
  *
  * \sa class TriangularView
  */
@@ -653,7 +664,6 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
 {
  typedef TriangularView<MatrixType,Mode> XprType;
  typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
-  typedef evaluator<XprType> type;
  unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
 };

@@ -723,8 +733,8 @@ EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, co
 {
  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
  
-  typedef typename evaluator<DstXprType>::type DstEvaluatorType;
-  typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;

  DstEvaluatorType dstEvaluator(dst);
  SrcEvaluatorType srcEvaluator(src);
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -41,7 +41,7 @@ struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
  typedef typename traits<MatrixType>::StorageKind StorageKind;
  typedef typename traits<MatrixType>::XprKind XprKind;
  typedef typename MatrixType::Scalar InputScalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
  enum {
    RowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::RowsAtCompileTime,
@@ -65,13 +65,16 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
    typedef typename internal::traits<PartialReduxExpr>::MatrixTypeNested MatrixTypeNested;
    typedef typename internal::traits<PartialReduxExpr>::_MatrixTypeNested _MatrixTypeNested;

+    EIGEN_DEVICE_FUNC
    explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
      : m_matrix(mat), m_functor(func) {}

+    EIGEN_DEVICE_FUNC
    Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    EIGEN_DEVICE_FUNC
    Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }

-    EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
    {
      if (Direction==Vertical)
        return m_functor(m_matrix.col(j));
@@ -79,7 +82,7 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
        return m_functor(m_matrix.row(i));
    }

-    const Scalar coeff(Index index) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
    {
      if (Direction==Vertical)
        return m_functor(m_matrix.col(index));
@@ -100,7 +103,8 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
    template<typename Scalar, int Size> struct Cost                     \
    { enum { value = COST }; };                                         \
    template<typename XprType>                                          \
-    EIGEN_STRONG_INLINE ResultType operator()(const XprType& mat) const \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                               \
+    ResultType operator()(const XprType& mat) const                     \
    { return mat.MEMBER(); } \
  }

@@ -124,13 +128,13 @@ EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
 template <typename BinaryOp, typename Scalar>
 struct member_redux {
  typedef typename result_of<
-                     BinaryOp(Scalar)
+                     BinaryOp(Scalar,Scalar)
                   >::type  result_type;
  template<typename _Scalar, int Size> struct Cost
  { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
-  explicit member_redux(const BinaryOp func) : m_functor(func) {}
+  EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
  template<typename Derived>
-  inline result_type operator()(const DenseBase<Derived>& mat) const
+  EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
  { return mat.redux(m_functor); }
  const BinaryOp m_functor;
 };
@@ -160,8 +164,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    typedef typename ExpressionType::Scalar Scalar;
    typedef typename ExpressionType::RealScalar RealScalar;
    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, ExpressionType&>::type ExpressionTypeNested;
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
    typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;

    template<template<typename _Scalar> class Functor,
@@ -182,17 +185,18 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    };

    enum {
-      IsVertical   = (Direction==Vertical) ? 1 : 0,
-      IsHorizontal = (Direction==Horizontal) ? 1 : 0
+      isVertical   = (Direction==Vertical) ? 1 : 0,
+      isHorizontal = (Direction==Horizontal) ? 1 : 0
    };

  protected:

    /** \internal
      * \returns the i-th subvector according to the \c Direction */
-    typedef typename internal::conditional<Direction==Vertical,
+    typedef typename internal::conditional<isVertical,
                               typename ExpressionType::ColXpr,
                               typename ExpressionType::RowXpr>::type SubVector;
+    EIGEN_DEVICE_FUNC
    SubVector subVector(Index i)
    {
      return SubVector(m_matrix.derived(),i);
@@ -200,58 +204,62 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** \internal
      * \returns the number of subvectors in the direction \c Direction */
+    EIGEN_DEVICE_FUNC
    Index subVectors() const
-    { return Direction==Vertical?m_matrix.cols():m_matrix.rows(); }
+    { return isVertical?m_matrix.cols():m_matrix.rows(); }

    template<typename OtherDerived> struct ExtendedType {
      typedef Replicate<OtherDerived,
-                        Direction==Vertical   ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Horizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
+                        isVertical   ? 1 : ExpressionType::RowsAtCompileTime,
+                        isHorizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
    };

    /** \internal
      * Replicates a vector to match the size of \c *this */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    typename ExtendedType<OtherDerived>::Type
    extendedTo(const DenseBase<OtherDerived>& other) const
    {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxColsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxColsAtCompileTime==1),
                          YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxRowsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1),
                          YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
      return typename ExtendedType<OtherDerived>::Type
                      (other.derived(),
-                       Direction==Vertical   ? 1 : m_matrix.rows(),
-                       Direction==Horizontal ? 1 : m_matrix.cols());
+                       isVertical   ? 1 : m_matrix.rows(),
+                       isHorizontal ? 1 : m_matrix.cols());
    }
    
    template<typename OtherDerived> struct OppositeExtendedType {
      typedef Replicate<OtherDerived,
-                        Direction==Horizontal ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Vertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
+                        isHorizontal ? 1 : ExpressionType::RowsAtCompileTime,
+                        isVertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
    };

    /** \internal
      * Replicates a vector in the opposite direction to match the size of \c *this */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    typename OppositeExtendedType<OtherDerived>::Type
    extendedToOpposite(const DenseBase<OtherDerived>& other) const
    {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxColsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxColsAtCompileTime==1),
                          YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxRowsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxRowsAtCompileTime==1),
                          YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
      return typename OppositeExtendedType<OtherDerived>::Type
                      (other.derived(),
-                       Direction==Horizontal  ? 1 : m_matrix.rows(),
-                       Direction==Vertical    ? 1 : m_matrix.cols());
+                       isHorizontal  ? 1 : m_matrix.rows(),
+                       isVertical    ? 1 : m_matrix.cols());
    }

  public:
-
+    EIGEN_DEVICE_FUNC
    explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}

    /** \internal */
+    EIGEN_DEVICE_FUNC
    inline const ExpressionType& _expression() const { return m_matrix; }

    /** \returns a row or column vector expression of \c *this reduxed by \a func
@@ -262,6 +270,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
      */
    template<typename BinaryOp>
+    EIGEN_DEVICE_FUNC
    const typename ReduxReturnType<BinaryOp>::Type
    redux(const BinaryOp& func = BinaryOp()) const
    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
@@ -290,6 +299,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_minCoeff.out
      *
      * \sa DenseBase::minCoeff() */
+    EIGEN_DEVICE_FUNC
    const MinCoeffReturnType minCoeff() const
    { return MinCoeffReturnType(_expression()); }

@@ -302,6 +312,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_maxCoeff.out
      *
      * \sa DenseBase::maxCoeff() */
+    EIGEN_DEVICE_FUNC
    const MaxCoeffReturnType maxCoeff() const
    { return MaxCoeffReturnType(_expression()); }

@@ -313,6 +324,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_squaredNorm.out
      *
      * \sa DenseBase::squaredNorm() */
+    EIGEN_DEVICE_FUNC
    const SquaredNormReturnType squaredNorm() const
    { return SquaredNormReturnType(_expression()); }

@@ -324,6 +336,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_norm.out
      *
      * \sa DenseBase::norm() */
+    EIGEN_DEVICE_FUNC
    const NormReturnType norm() const
    { return NormReturnType(_expression()); }

@@ -334,6 +347,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * \sa DenseBase::blueNorm() */
+    EIGEN_DEVICE_FUNC
    const BlueNormReturnType blueNorm() const
    { return BlueNormReturnType(_expression()); }

@@ -344,6 +358,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * \sa DenseBase::stableNorm() */
+    EIGEN_DEVICE_FUNC
    const StableNormReturnType stableNorm() const
    { return StableNormReturnType(_expression()); }

@@ -354,6 +369,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * This is a vector with real entries, even if the original matrix has complex entries.
      *
      * \sa DenseBase::hypotNorm() */
+    EIGEN_DEVICE_FUNC
    const HypotNormReturnType hypotNorm() const
    { return HypotNormReturnType(_expression()); }

@@ -364,6 +380,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_sum.out
      *
      * \sa DenseBase::sum() */
+    EIGEN_DEVICE_FUNC
    const SumReturnType sum() const
    { return SumReturnType(_expression()); }

@@ -371,6 +388,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    * of each column (or row) of the referenced expression.
    *
    * \sa DenseBase::mean() */
+    EIGEN_DEVICE_FUNC
    const MeanReturnType mean() const
    { return MeanReturnType(_expression()); }

@@ -379,6 +397,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * This expression can be assigned to a vector with entries of type \c bool.
      *
      * \sa DenseBase::all() */
+    EIGEN_DEVICE_FUNC
    const AllReturnType all() const
    { return AllReturnType(_expression()); }

@@ -387,6 +406,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * This expression can be assigned to a vector with entries of type \c bool.
      *
      * \sa DenseBase::any() */
+    EIGEN_DEVICE_FUNC
    const AnyReturnType any() const
    { return Any(_expression()); }

@@ -399,6 +419,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_count.out
      *
      * \sa DenseBase::count() */
+    EIGEN_DEVICE_FUNC
    const CountReturnType count() const
    { return CountReturnType(_expression()); }

@@ -409,6 +430,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude PartialRedux_prod.out
      *
      * \sa DenseBase::prod() */
+    EIGEN_DEVICE_FUNC
    const ProdReturnType prod() const
    { return ProdReturnType(_expression()); }

@@ -420,10 +442,12 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * Output: \verbinclude Vectorwise_reverse.out
      *
      * \sa DenseBase::reverse() */
+    EIGEN_DEVICE_FUNC
    const ReverseReturnType reverse() const
    { return ReverseReturnType( _expression() ); }

-    typedef Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1> ReplicateReturnType;
+    typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
+    EIGEN_DEVICE_FUNC
    const ReplicateReturnType replicate(Index factor) const;

    /**
@@ -435,17 +459,20 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
      */
    // NOTE implemented here because of sunstudio's compilation errors
-    template<int Factor> const Replicate<ExpressionType,(IsVertical?Factor:1),(IsHorizontal?Factor:1)>
+    // NOTE: the return type uses isVertical*Factor+isHorizontal instead of (isVertical?Factor:1) to work around a CUDA bug with the ternary operator
+    template<int Factor> const Replicate<ExpressionType,isVertical*Factor+isHorizontal,isHorizontal*Factor+isVertical>
+    EIGEN_DEVICE_FUNC
    replicate(Index factor = Factor) const
    {
-      return Replicate<ExpressionType,Direction==Vertical?Factor:1,Direction==Horizontal?Factor:1>
-          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+      return Replicate<ExpressionType,(isVertical?Factor:1),(isHorizontal?Factor:1)>
+          (_expression(),isVertical?factor:1,isHorizontal?factor:1);
    }

 /////////// Artithmetic operators ///////////

    /** Copies the vector \a other to each subvector of \c *this */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    ExpressionType& operator=(const DenseBase<OtherDerived>& other)
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -456,6 +483,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** Adds the vector \a other to each subvector of \c *this */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    ExpressionType& operator+=(const DenseBase<OtherDerived>& other)
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -465,6 +493,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** Substracts the vector \a other to each subvector of \c *this */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -474,6 +503,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** Multiples each subvector of \c *this by the vector \a other */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -485,6 +515,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** Divides each subvector of \c *this by the vector \a other */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    ExpressionType& operator/=(const DenseBase<OtherDerived>& other)
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -495,7 +526,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    }

    /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
    CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
                  const ExpressionTypeNestedCleaned,
                  const typename ExtendedType<OtherDerived>::Type>
@@ -508,6 +539,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
                  const ExpressionTypeNestedCleaned,
                  const typename ExtendedType<OtherDerived>::Type>
@@ -520,10 +552,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    /** Returns the expression where each subvector is the product of the vector \a other
      * by the corresponding subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
    CwiseBinaryOp<internal::scalar_product_op<Scalar>,
                  const ExpressionTypeNestedCleaned,
                  const typename ExtendedType<OtherDerived>::Type>
+    EIGEN_DEVICE_FUNC
    operator*(const DenseBase<OtherDerived>& other) const
    {
      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -535,6 +568,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    /** Returns the expression where each subvector is the quotient of the corresponding
      * subvector of \c *this by the vector \a other */
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                  const ExpressionTypeNestedCleaned,
                  const typename ExtendedType<OtherDerived>::Type>
@@ -550,6 +584,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      * The referenced matrix is \b not modified.
      * \sa MatrixBase::normalized(), normalize()
      */
+    EIGEN_DEVICE_FUNC
    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                  const ExpressionTypeNestedCleaned,
                  const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
@@ -559,10 +594,12 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    /** Normalize in-place each row or columns of the referenced matrix.
      * \sa MatrixBase::normalize(), normalized()
      */
-    void normalize() {
+    EIGEN_DEVICE_FUNC void normalize() {
      m_matrix = this->normalized();
    }

+    EIGEN_DEVICE_FUNC inline void reverseInPlace();
+
 /////////// Geometry module ///////////

    typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
@@ -570,6 +607,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp

    typedef typename ExpressionType::PlainObject CrossReturnType;
    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
    const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;

    enum {
@@ -600,19 +638,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    ExpressionTypeNested m_matrix;
 };

-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_colwise.cpp
-  * Output: \verbinclude MatrixBase_colwise.out
-  *
-  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstColwiseReturnType
-DenseBase<Derived>::colwise() const
-{
-  return ConstColwiseReturnType(derived());
-}
+// NOTE: the const overload of colwise() was moved to DenseBase.h because of a CUDA compiler bug.
+

 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
  *
@@ -625,19 +652,8 @@ DenseBase<Derived>::colwise()
  return ColwiseReturnType(derived());
 }

-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_rowwise.cpp
-  * Output: \verbinclude MatrixBase_rowwise.out
-  *
-  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstRowwiseReturnType
-DenseBase<Derived>::rowwise() const
-{
-  return ConstRowwiseReturnType(derived());
-}
+// NOTE: the const overload of rowwise() was moved to DenseBase.h because of a CUDA compiler bug.
+

 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
  *
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -22,6 +22,7 @@ struct visitor_impl
    row = (UnrollCount-1) % Derived::RowsAtCompileTime
  };

+  EIGEN_DEVICE_FUNC
  static inline void run(const Derived &mat, Visitor& visitor)
  {
    visitor_impl<Visitor, Derived, UnrollCount-1>::run(mat, visitor);
@@ -32,6 +33,7 @@ struct visitor_impl
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, 1>
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(const Derived &mat, Visitor& visitor)
  {
    return visitor.init(mat.coeff(0, 0), 0, 0);
@@ -41,6 +43,7 @@ struct visitor_impl<Visitor, Derived, 1>
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, Dynamic>
 {
+  EIGEN_DEVICE_FUNC
  static inline void run(const Derived& mat, Visitor& visitor)
  {
    visitor.init(mat.coeff(0,0), 0, 0);
@@ -57,6 +60,7 @@ template<typename XprType>
 class visitor_evaluator
 {
 public:
+  EIGEN_DEVICE_FUNC
  explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
  
  typedef typename XprType::Scalar Scalar;
@@ -67,15 +71,15 @@ public:
    CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
  };
  
-  Index rows() const { return m_xpr.rows(); }
-  Index cols() const { return m_xpr.cols(); }
-  Index size() const { return m_xpr.size(); }
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }

-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
  { return m_evaluator.coeff(row, col); }
  
 protected:
-  typename internal::evaluator<XprType>::nestedType m_evaluator;
+  internal::evaluator<XprType> m_evaluator;
  const XprType &m_xpr;
 };
 } // end namespace internal
@@ -99,6 +103,7 @@ protected:
  */
 template<typename Derived>
 template<typename Visitor>
+EIGEN_DEVICE_FUNC
 void DenseBase<Derived>::visit(Visitor& visitor) const
 {
  typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
@@ -125,6 +130,7 @@ struct coeff_visitor
  typedef typename Derived::Scalar Scalar;
  Index row, col;
  Scalar res;
+  EIGEN_DEVICE_FUNC
  inline void init(const Scalar& value, Index i, Index j)
  {
    res = value;
@@ -142,6 +148,7 @@ template <typename Derived>
 struct min_coeff_visitor : coeff_visitor<Derived>
 {
  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
  void operator() (const Scalar& value, Index i, Index j)
  {
    if(value < this->res)
@@ -168,7 +175,8 @@ struct functor_traits<min_coeff_visitor<Scalar> > {
 template <typename Derived>
 struct max_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar Scalar; 
+  EIGEN_DEVICE_FUNC
  void operator() (const Scalar& value, Index i, Index j)
  {
    if(value > this->res)
@@ -196,6 +204,7 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
  */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
@@ -213,6 +222,7 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
  */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
 {
@@ -230,6 +240,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
  */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
@@ -247,6 +258,7 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
  */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
 {
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -45,7 +45,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -267,7 +267,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };

 template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -271,6 +271,86 @@ pexp<Packet8f>(const Packet8f& _x) {
  return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
 }

+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+pexp<Packet4d>(const Packet4d& _x) {
+  Packet4d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
+  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
+  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
+
+  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
+  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+
+  Packet4d tmp, fx;
+
+  // clamp x
+  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
+  // Express exp(x) as exp(g + n*log(2)).
+  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
+
+  // Round x/log(2) to the nearest integer (floor of x*LOG2EF + 0.5), i.e. the "n" described above.
+  fx = _mm256_floor_pd(fx);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  tmp = pmul(fx, p4d_cephes_exp_C1);
+  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet4d x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet4d px = p4d_cephes_exp_p0;
+  px = pmadd(px, x2, p4d_cephes_exp_p1);
+  px = pmadd(px, x2, p4d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet4d qx = p4d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q3);
+
+  // Assemble the rational approximant exp(g) ~= 1 + 2*px/(qx - px), where
+  // px = g*P(g^2) and qx = Q(g^2); this form is copied from the SSE2 routines.
+  // TODO(gonnet): Perhaps find a better rational interpolant?
+  x = _mm256_div_pd(px, psub(qx, px));
+  x = pmadd(p4d_2, x, p4d_1);
+
+  // Build e=2^n by constructing the exponents in a 128-bit vector and
+  // shifting them to where they belong in double-precision values.
+  __m128i emm0 = _mm256_cvtpd_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_1023);
+  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+  __m128i lo = _mm_slli_epi64(emm0, 52);
+  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+  e = _mm256_insertf128_si256(e, hi, 1);
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  return pmax(pmul(x, Packet4d(e)), _x);
+}
+
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
@@ -300,15 +380,59 @@ psqrt<Packet8f>(const Packet8f& _x) {
  return pmul(_x, x);
 }
 #else
-template <>
-EIGEN_STRONG_INLINE Packet8f psqrt<Packet8f>(const Packet8f& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& x) {
  return _mm256_sqrt_ps(x);
 }
 #endif
-template <>
-EIGEN_STRONG_INLINE Packet4d psqrt<Packet4d>(const Packet4d& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d psqrt<Packet4d>(const Packet4d& x) {
  return _mm256_sqrt_pd(x);
 }
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+ _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
+
+  Packet8f neg_half = pmul(_x, p8f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+  Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
+  Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
+  Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
+                                        _mm256_and_ps(zero_mask, p8f_inf));
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm256_or_ps(x, infs_and_nans);
+}
+
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
+}
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+  _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
+  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
+}
+

 }  // end namespace internal

--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -60,11 +60,12 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasHalfPacket = 1,

    HasDiv  = 1,
-    HasSin  = 1,
+    HasSin  = EIGEN_FAST_MATH,
    HasCos  = 0,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1,
+    HasRsqrt = 1,
    HasBlend = 1
  };
 };
@@ -79,8 +80,9 @@ template<> struct packet_traits<double> : default_packet_traits
    HasHalfPacket = 1,

    HasDiv  = 1,
-    HasExp  = 0,
+    HasExp  = 1,
    HasSqrt = 1,
+    HasRsqrt = 1,
    HasBlend = 1
  };
 };
@@ -98,9 +100,9 @@ template<> struct packet_traits<int>    : default_packet_traits
 };
 */

-template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8}; };
-template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
-template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8}; };
+template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };

 template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
@@ -109,8 +111,8 @@ template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { re
 template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }

-template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
-template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }

 template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
@@ -432,26 +434,30 @@ struct palign_impl<Offset,Packet8f>
    if (Offset==1)
    {
      first = _mm256_blend_ps(first, second, 1);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
-      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_ps(first, second, 3);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
-      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_ps(first, second, 7);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
-      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
    }
    else if (Offset==4)
    {
      first = _mm256_blend_ps(first, second, 15);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
-      first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
    }
    else if (Offset==5)
    {
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// For now we use SSE to handle integers, so we can't use AVX instructions to cast
+// from int to float
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+// Packet8f -> Packet8i conversion. Uses the truncating form (cvttps) so the
+// result matches a scalar static_cast<int>(float); cvtps rounds to nearest.
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+  return _mm256_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+  return _mm256_cvtepi32_ps(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX_H
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -53,7 +53,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -275,7 +275,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };

 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -408,7 +408,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
  // TODO optimize it for AltiVec
  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
-  return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
+  return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64))));
 }

 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -0,0 +1,290 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The exp and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+  /* the smallest non denormalized float number */
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+  /* natural logarithm computed for 4 simultaneous floats;
+    returns NaN for x < 0 (or NaN input) and -INF for x == 0
+  */
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+
+  Packet4i emm0;
+
+  /* isvalid_mask is 0 if x < 0 or x is NaN. */
+  Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
+  Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
+
+  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
+  emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
+                reinterpret_cast<Packet4ui>(p4i_23));
+
+  /* keep only the fractional part */
+  x = pand(x, p4f_inv_mant_mask);
+  x = por(x, p4f_half);
+
+  emm0 = psub(emm0, p4i_0x7f);
+  Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
+  Packet4f tmp = pand(x, mask);
+  x = psub(x, p4f_1);
+  e = psub(e, pand(p4f_1, mask));
+  x = padd(x, tmp);
+
+  Packet4f x2 = pmul(x,x);
+  Packet4f x3 = pmul(x2,x);
+
+  Packet4f y, y1, y2;
+  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y  = pmadd(y , x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y1 = pmul(e, p4f_cephes_log_q1);
+  tmp = pmul(x2, p4f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p4f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+  // negative arg will be NAN, 0 will be -INF
+  x = vec_sel(x, p4f_minus_inf, iszero_mask);
+  x = vec_sel(p4f_minus_nan, x, isvalid_mask);
+  return x;
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+  Packet4f tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = vec_max(vec_min(x, p4f_exp_hi), p4f_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+  fx = vec_floor(fx);
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = vec_cts(fx, 0);
+  emm0 = vec_add(emm0, p4i_0x7f);
+  emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
+                 isnumber_mask);
+}
+
+#ifdef __VSX__
+// VSX support varies between different compilers and even different
+// versions of the same compiler.  For gcc version >= 4.9.3, we can use
+// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
+// a slow version that works with older compilers. 
+static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 0) || \
+    (EIGEN_GNUC_AT(4, 9) && __GNUC_PATCHLEVEL__ >= 3)
+  return vec_cts(x, 0);    // TODO: check clang version.
+#else
+  double tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2l l = { static_cast<long long>(tmp[0]),
+                 static_cast<long long>(tmp[1]) };
+  return l;
+#endif
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  Packet2d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+  Packet2d tmp, fx;
+  Packet2l emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
+
+  fx = vec_floor(fx);
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x,x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);
+
+  // build 2^n
+  emm0 = ConvertToPacket2l(fx);
+
+#ifdef __POWER8_VECTOR__ 
+  static const Packet2l p2l_1023 = { 1023, 1023 };
+  static const Packet2ul p2ul_52 = { 52, 52 };
+
+  emm0 = vec_add(emm0, p2l_1023);
+  emm0 = vec_sl(emm0, p2ul_52);
+#else
+  // Code is a bit complex for POWER7.  There is actually a
+  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
+  // So we shift (52-32) bits and do a word swap with zeros.
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+  _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32
+
+  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
+  emm04i = vec_add(emm04i, p4i_1023);
+  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
+  static const Packet16uc perm = {
+    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
+    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+#ifdef  _BIG_ENDIAN
+  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
+#else
+  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
+#endif
+
+#endif
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
+                 isnumber_mask);
+}
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -59,6 +59,9 @@ typedef __vector unsigned char  Packet16uc;
 #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
  Packet2l p2l_##NAME = pset1<Packet2l>(X)

+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))

@@ -66,10 +69,12 @@ typedef __vector unsigned char  Packet16uc;
 // These constants are endian-agnostic
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+#ifndef __VSX__
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+#endif
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}

 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
@@ -130,8 +135,8 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasDiv  = 1,
    HasSin  = 0,
    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
    HasSqrt = 0
  };
 };
@@ -148,8 +153,8 @@ template<> struct packet_traits<int>    : default_packet_traits
 };


-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
 {
@@ -289,8 +294,8 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
  to[3*stride] = ai[3];
 }

-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }

 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
@@ -751,12 +756,12 @@ template<> struct packet_traits<double> : default_packet_traits
    HasHalfPacket = 0,

    HasDiv  = 1,
-    HasExp  = 0,
+    HasExp  = 1,
    HasSqrt = 0
  };
 };

-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };


 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
@@ -807,7 +812,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
  to[0*stride] = af[0];
  to[1*stride] = af[1];
 }
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }

 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }

--- a/Eigen/src/Core/arch/CMakeLists.txt
+++ b/Eigen/src/Core/arch/CMakeLists.txt
@@ -1,5 +1,9 @@
-ADD_SUBDIRECTORY(SSE)
 ADD_SUBDIRECTORY(AltiVec)
-ADD_SUBDIRECTORY(NEON)
 ADD_SUBDIRECTORY(AVX)
+ADD_SUBDIRECTORY(CUDA)
 ADD_SUBDIRECTORY(Default)
+ADD_SUBDIRECTORY(NEON)
+ADD_SUBDIRECTORY(SSE)
+
+
+
--- a/Eigen/src/Core/arch/CUDA/CMakeLists.txt
+++ b/Eigen/src/Core/arch/CUDA/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_arch_CUDA_SRCS "*.h")  # every *.h header of the CUDA backend in this directory
+
+INSTALL(FILES
+  ${Eigen_Core_arch_CUDA_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/CUDA COMPONENT Devel  # destination mirrors the source-tree path
+)
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -18,49 +18,49 @@ namespace internal {
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plog<float4>(const float4& a)
 {
  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
 }

-template<> EIGEN_STRONG_INLINE
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 plog<double2>(const double2& a)
 {
  return make_double2(log(a.x), log(a.y));
 }

-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pexp<float4>(const float4& a)
 {
  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
 }

-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pexp<double2>(const double2& a)
 {
  return make_double2(exp(a.x), exp(a.y));
 }

-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 psqrt<float4>(const float4& a)
 {
  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
 }

-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 psqrt<double2>(const double2& a)
 {
  return make_double2(sqrt(a.x), sqrt(a.y));
 }

-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 prsqrt<float4>(const float4& a)
 {
  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
 }

-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 prsqrt<double2>(const double2& a)
 {
  return make_double2(rsqrt(a.x), rsqrt(a.y));
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -65,8 +65,8 @@ template<> struct packet_traits<double> : default_packet_traits
 };


-template<> struct unpacket_traits<float4> { typedef float  type; enum {size=4}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; };
+template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };

 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
  return make_float4(from, from, from, from);
@@ -76,10 +76,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
 }


-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
  return make_float4(a, a+1, a+2, a+3);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
  return make_double2(a, a+1);
 }

@@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(cons
 }
 #endif

-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
 }

-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
  return make_double2(from[0*stride], from[1*stride]);
 }

-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
  to[stride*0] = from.x;
  to[stride*1] = from.y;
  to[stride*2] = from.z;
  to[stride*3] = from.w;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
  to[stride*0] = from.x;
  to[stride*1] = from.y;
 }
@@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a)
 }

 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
-  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
-  return make_double2(abs(a.x), abs(a.y));
+  return make_double2(fabs(a.x), fabs(a.y));
 }


-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
  double tmp = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
@@ -279,7 +279,7 @@ ptranspose(PacketBlock<float4,4>& kernel) {
  kernel.packet[3].z = tmp;
 }

-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<double2,2>& kernel) {
  double tmp = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -48,7 +48,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -114,7 +114,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f

 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  Packet4f res;
+  Packet4f res = pset1<Packet4f>(0.f);
  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
@@ -272,7 +272,7 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 }

 //---------- double ----------
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG

 static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);

@@ -306,7 +306,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };

 template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -365,7 +365,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c

 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
 {
-  Packet2d res;
+  Packet2d res = pset1<Packet2d>(0.0);
  res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
  res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
  return Packet1cd(res);
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -76,12 +76,12 @@ typedef uint32x4_t  Packet4ui;
 template<> struct packet_traits<float>  : default_packet_traits
 {
  typedef Packet4f type;
-  typedef Packet2f half;
+  typedef Packet4f half; // Packet2f intrinsics not implemented yet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
-    HasHalfPacket=1,
+    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
   
    HasDiv  = 1,
    // FIXME check the Has*
@@ -95,12 +95,12 @@ template<> struct packet_traits<float>  : default_packet_traits
 template<> struct packet_traits<int>    : default_packet_traits
 {
  typedef Packet4i type;
-  typedef Packet2i half;
+  typedef Packet4i half; // Packet2i intrinsics not implemented yet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
-    HasHalfPacket=1
+    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
    // FIXME check the Has*
  };
 };
@@ -114,18 +114,18 @@ EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q
 EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
 #endif

-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }

-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
  return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
 {
  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
  return vaddq_s32(pset1<Packet4i>(a), countdown);
@@ -252,7 +252,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& f

 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  Packet4f res;
+  Packet4f res = pset1<Packet4f>(0.f);
  res = vsetq_lane_f32(from[0*stride], res, 0);
  res = vsetq_lane_f32(from[1*stride], res, 1);
  res = vsetq_lane_f32(from[2*stride], res, 2);
@@ -261,7 +261,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
 }
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
-  Packet4i res;
+  Packet4i res = pset1<Packet4i>(0);
  res = vsetq_lane_s32(from[0*stride], res, 0);
  res = vsetq_lane_s32(from[1*stride], res, 1);
  res = vsetq_lane_s32(from[2*stride], res, 2);
@@ -309,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  a_hi = vget_high_s32(a_r64);
  return vcombine_s32(a_hi, a_lo);
 }
+
+template<size_t offset>
+struct protate_impl<offset, Packet4f>
+{
+  static Packet4f run(const Packet4f& a) {
+    return vextq_f32(a, a, offset);
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet4i>
+{
+  static Packet4i run(const Packet4i& a) {
+    return vextq_s32(a, a, offset);
+  }
+};
+
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }

@@ -501,7 +518,19 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
 }

 //---------- double ----------
-#if EIGEN_ARCH_ARM64
+
+// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
+// Confirmed at least with __apple_build_version__ = 6000054.
+#ifdef __apple_build_version__
+// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
+// https://gist.github.com/yamaya/2924292 suggests that the first 3 digits are only updated with
+// major toolchain updates.
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#else
+#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
+#endif
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG

 #if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
 // Bug 907: workaround missing declarations of the following two functions in the ADK
@@ -524,12 +553,12 @@ typedef float64x1_t Packet1d;
 template<> struct packet_traits<double>  : default_packet_traits
 {
  typedef Packet2d type;
-  typedef Packet1d half;
+  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
-    HasHalfPacket=1,
+    HasHalfPacket=0,
   
    HasDiv  = 1,
    // FIXME check the Has*
@@ -541,11 +570,11 @@ template<> struct packet_traits<double>  : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet2d> { typedef double  type; enum {size=2}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double  type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };

 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }

-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a)
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
  return vaddq_f64(pset1<Packet2d>(a), countdown);
@@ -608,7 +637,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d&

 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  Packet2d res;
+  Packet2d res = pset1<Packet2d>(0.0);
  res = vsetq_lane_f64(from[0*stride], res, 0);
  res = vsetq_lane_f64(from[1*stride], res, 1);
  return res;
@@ -625,6 +654,14 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { retu

 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }

+template<size_t offset>
+struct protate_impl<offset, Packet2d>
+{
+  static Packet2d run(const Packet2d& a) {
+    return vextq_f64(a, a, offset);
+  }
+};
+
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }

 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -50,7 +50,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 };
 #endif

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -297,7 +297,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 };
 #endif

-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };

 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
@@ -474,7 +474,7 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 }

 template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
-  __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
+  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
  return Packet2cf(_mm_castpd_ps(result));
 }

--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -138,7 +138,6 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
 #ifdef EIGEN_VECTORIZE_SSE4_1
  fx = _mm_floor_ps(fx);
 #else
-  tmp = _mm_setzero_ps();
  emm0 = _mm_cvttps_epi32(fx);
  tmp  = _mm_cvtepi32_ps(emm0);
  /* if greater, substract 1 */
@@ -207,7 +206,6 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
 #ifdef EIGEN_VECTORIZE_SSE4_1
  fx = _mm_floor_pd(fx);
 #else
-  tmp = _mm_setzero_pd();
  emm0 = _mm_cvttpd_epi32(fx);
  tmp  = _mm_cvtepi32_pd(emm0);
  /* if greater, substract 1 */
@@ -464,11 +462,59 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)

 #else

-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }

 #endif

-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+
+  Packet4f neg_half = pmul(_x, p4f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
+  Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
+  Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
+  Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
+                                        _mm_and_ps(zero_mask, p4f_inf));
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm_or_ps(x, infs_and_nans);
+}
+
+#else
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  // Unfortunately we can't use the much faster _mm_rsqrt_ps since it only provides an approximation.
+  return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
+}
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
+  return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+}

 } // end namespace internal

--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -28,13 +28,12 @@ namespace internal {
 #endif
 #endif

-#if defined EIGEN_VECTORIZE_AVX && EIGEN_COMP_GNUC_STRICT
+#if (defined EIGEN_VECTORIZE_AVX) && EIGEN_COMP_GNUC_STRICT && (__GXX_ABI_VERSION < 1004)
 // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
 // have overloads for both types without linking error.
 // One solution is to increase ABI version using -fabi-version=4 (or greater).
-// To workaround this inconvenince, we rather wrap 128bit types into the following helper
+// Otherwise, we work around this inconvenience by wrapping 128bit types into the following helper
 // structure:
-// TODO disable this wrapper if abi-versio>=4, but to detect that without asking the user to define a macro?
 template<typename T>
 struct eigen_packet_wrapper
 {
@@ -109,6 +108,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1,
+    HasRsqrt = 1,
    HasBlend = 1
  };
 };
@@ -125,6 +125,7 @@ template<> struct packet_traits<double> : default_packet_traits
    HasDiv  = 1,
    HasExp  = 1,
    HasSqrt = 1,
+    HasRsqrt = 1,
    HasBlend = 1
  };
 };
@@ -143,9 +144,9 @@ template<> struct packet_traits<int>    : default_packet_traits
  };
 };

-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

 #if EIGEN_COMP_MSVC==1500
 // Workaround MSVC 9 internal compiler error.
@@ -171,11 +172,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
 }
 #endif
  
-#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-#endif
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }

 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
@@ -463,6 +462,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 { return _mm_shuffle_epi32(a,0x1B); }

+template<size_t offset>
+struct protate_impl<offset, Packet4f>
+{
+  static Packet4f run(const Packet4f& a) {
+    return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet4i>
+{
+  static Packet4i run(const Packet4i& a) {
+    return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet2d>
+{
+  static Packet2d run(const Packet2d& a) {
+    return vec2d_swizzle1(a, offset, (offset + 1) % 2);
+  }
+};

 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SSE_H
+#define EIGEN_TYPE_CASTING_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return _mm_cvttps_epi32(a);
+}
+
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return _mm_cvtepi32_ps(a);
+}
+
+
+template <>
+struct type_casting_traits<double, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  // Simply discard the second half of the input
+  return _mm_cvtps_pd(a);
+}
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SSE_H
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -150,14 +150,6 @@ template<typename Scalar> struct swap_assign_op {
    swap(a,const_cast<Scalar&>(b));
 #endif
  }
-  
-  template<int LhsAlignment, int RhsAlignment, typename Packet>
-  EIGEN_STRONG_INLINE void swapPacket(Scalar* a, Scalar* b) const
-  {
-    Packet tmp = internal::ploadt<Packet,RhsAlignment>(b);
-    internal::pstoret<Scalar,Packet,RhsAlignment>(b, internal::ploadt<Packet,LhsAlignment>(a));
-    internal::pstoret<Scalar,Packet,LhsAlignment>(a, tmp);
-  }
 };
 template<typename Scalar>
 struct functor_traits<swap_assign_op<Scalar> > {
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -154,6 +154,48 @@ struct functor_traits<scalar_max_op<Scalar> > {
  };
 };

+/** \internal
+  * \brief Template functors for comparison of two scalars
+  * \todo Implement packet-comparisons
+  */
+template<typename Scalar, ComparisonName cmp> struct scalar_cmp_op;
+
+template<typename Scalar, ComparisonName cmp>
+struct functor_traits<scalar_cmp_op<Scalar, cmp> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = false
+  };
+};
+
+template<ComparisonName Cmp, typename Scalar>
+struct result_of<scalar_cmp_op<Scalar, Cmp>(Scalar,Scalar)> {
+  typedef bool type;
+};
+
+
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_EQ> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LT> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LE> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_UNORD> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;}
+};
+
+
 /** \internal
  * \brief Template functor to compute the hypot of two scalars
  *
@@ -299,7 +341,6 @@ template<> struct functor_traits<scalar_boolean_or_op> {
 */
 template<typename Scalar>
 struct scalar_multiple_op {
-  typedef typename packet_traits<Scalar>::type Packet;
  // FIXME default copy constructors seems bugged with std::complex<>
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
@@ -307,6 +348,7 @@ struct scalar_multiple_op {
  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
+  template <typename Packet>
  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
  { return internal::pmul(a, pset1<Packet>(m_other)); }
  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
@@ -337,11 +379,11 @@ struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
  */
 template<typename Scalar>
 struct scalar_quotient1_op {
-  typedef typename packet_traits<Scalar>::type Packet;
  // FIXME default copy constructors seems bugged with std::complex<>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
+  template <typename Packet>
  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
  { return internal::pdiv(a, pset1<Packet>(m_other)); }
  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
@@ -350,6 +392,18 @@ template<typename Scalar>
 struct functor_traits<scalar_quotient1_op<Scalar> >
 { enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };

+template<typename Scalar1, typename Scalar2>
+struct scalar_quotient2_op {
+  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const scalar_quotient2_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const Scalar2& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a / m_other; }
+  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
+};
+template<typename Scalar1,typename Scalar2>
+struct functor_traits<scalar_quotient2_op<Scalar1,Scalar2> >
+{ enum { Cost = 2 * NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
+
 // In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
 // where the mixing of different types is handled by scalar_product_traits
 // In particular, real * complex<real> is allowed.
@@ -367,11 +421,11 @@ template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<s
 /* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
 template<typename Scalar>
 struct scalar_add_op {
-  typedef typename packet_traits<Scalar>::type Packet;
  // FIXME default copy constructors seems bugged with std::complex<>
  EIGEN_DEVICE_FUNC inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC inline scalar_add_op(const Scalar& other) : m_other(other) { }
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a + m_other; }
+  template <typename Packet>
  inline const Packet packetOp(const Packet& a) const
  { return internal::padd(a, pset1<Packet>(m_other)); }
  const Scalar m_other;
@@ -386,10 +440,10 @@ struct functor_traits<scalar_add_op<Scalar> >
  */
 template<typename Scalar>
 struct scalar_sub_op {
-  typedef typename packet_traits<Scalar>::type Packet;
  inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { }
  inline scalar_sub_op(const Scalar& other) : m_other(other) { }
  inline Scalar operator() (const Scalar& a) const { return a - m_other; }
+  template <typename Packet>
  inline const Packet packetOp(const Packet& a) const
  { return internal::psub(a, pset1<Packet>(m_other)); }
  const Scalar m_other;
@@ -404,10 +458,10 @@ struct functor_traits<scalar_sub_op<Scalar> >
  */
 template<typename Scalar>
 struct scalar_rsub_op {
-  typedef typename packet_traits<Scalar>::type Packet;
  inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { }
  inline scalar_rsub_op(const Scalar& other) : m_other(other) { }
  inline Scalar operator() (const Scalar& a) const { return m_other - a; }
+  template <typename Packet>
  inline const Packet packetOp(const Packet& a) const
  { return internal::psub(pset1<Packet>(m_other), a); }
  const Scalar m_other;
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -16,13 +16,12 @@ namespace internal {

 template<typename Scalar>
 struct scalar_constant_op {
-  typedef typename packet_traits<Scalar>::type Packet;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
  template<typename Index>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
+  template<typename Index, typename PacketType>
+  EIGEN_STRONG_INLINE const PacketType packetOp(Index, Index = 0) const { return internal::pset1<PacketType>(m_other); }
  const Scalar m_other;
 };
 template<typename Scalar>
@@ -39,7 +38,7 @@ template<typename Scalar>
 struct functor_traits<scalar_identity_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };

-template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
+template <typename Scalar, typename Packet, bool RandomAccess> struct linspaced_op_impl;

 // linear access for packet ops:
 // 1) initialization
@@ -49,15 +48,13 @@ template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
 //
 // TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
 //       in order to avoid the padd() in operator() ?
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,false>
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,false>
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-
  linspaced_op_impl(const Scalar& low, const Scalar& step) :
  m_low(low), m_step(step),
-  m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
-  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
+  m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*step)),
+  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Packet>(-unpacket_traits<Packet>::size)))) {}

  template<typename Index>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
@@ -78,14 +75,12 @@ struct linspaced_op_impl<Scalar,false>
 // random access for packet ops:
 // 1) each step
 //   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,true>
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,true>
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-
  linspaced_op_impl(const Scalar& low, const Scalar& step) :
  m_low(low), m_step(step),
-  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
+  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {}

  template<typename Index>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
@@ -106,12 +101,11 @@ struct linspaced_op_impl<Scalar,true>
 // Forward declaration (we default to random access which does not really give
 // us a speed gain when using packet access but it allows to use the functor in
 // nested expressions).
-template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
-template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
+template <typename Scalar, typename PacketType, bool RandomAccess = true> struct linspaced_op;
+template <typename Scalar, typename PacketType, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,PacketType,RandomAccess> >
 { enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
-template <typename Scalar, bool RandomAccess> struct linspaced_op
+template <typename Scalar, typename PacketType, bool RandomAccess> struct linspaced_op
 {
-  typedef typename packet_traits<Scalar>::type Packet;
  linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {}

  template<typename Index>
@@ -126,12 +120,12 @@ template <typename Scalar, bool RandomAccess> struct linspaced_op
    return impl(col + row);
  }

-  template<typename Index>
+  template<typename Index, typename Packet>
  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }

  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
+  template<typename Index, typename Packet>
  EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
  {
    eigen_assert(col==0 || row==0);
@@ -141,7 +135,8 @@ template <typename Scalar, bool RandomAccess> struct linspaced_op
  // This proxy object handles the actual required temporaries, the different
  // implementations (random vs. sequential access) as well as the
  // correct piping to size 2/4 packet operations.
-  const linspaced_op_impl<Scalar,RandomAccess> impl;
+  // TODO find a way to make the packet type configurable
+  const linspaced_op_impl<Scalar,PacketType,RandomAccess> impl;
 };

 // all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
--- a/Eigen/src/Core/functors/StlFunctors.h
+++ b/Eigen/src/Core/functors/StlFunctors.h
@@ -72,6 +72,8 @@ template<typename T>
 struct functor_traits<std::not_equal_to<T> >
 { enum { Cost = 1, PacketAccess = false }; };

+#if(__cplusplus < 201103L)
+// std::binder* are deprecated since c++11 and will be removed in c++17
 template<typename T>
 struct functor_traits<std::binder2nd<T> >
 { enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
@@ -79,6 +81,7 @@ struct functor_traits<std::binder2nd<T> >
 template<typename T>
 struct functor_traits<std::binder1st<T> >
 { enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+#endif

 template<typename T>
 struct functor_traits<std::unary_negate<T> >
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -55,6 +55,34 @@ struct functor_traits<scalar_abs_op<Scalar> >
  };
 };

+/** \internal
+  * \brief Template functor to compute the score of a scalar, to choose a pivot
+  *
+  * \sa class CwiseUnaryOp
+  */
+template<typename Scalar> struct scalar_score_coeff_op : scalar_abs_op<Scalar>
+{
+  typedef void Score_is_abs;
+};
+template<typename Scalar>
+struct functor_traits<scalar_score_coeff_op<Scalar> > : functor_traits<scalar_abs_op<Scalar> > {};
+
+/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.  */
+template<typename Scalar, typename=void> struct abs_knowing_score
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Score>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); }
+};
+template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Scal>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
+};
+
 /** \internal
  * \brief Template functor to compute the squared absolute value of a scalar
  *
@@ -94,6 +122,27 @@ struct functor_traits<scalar_conjugate_op<Scalar> >
  };
 };

+/** \internal
+  * \brief Template functor to compute the phase angle of a complex
+  *
+  * \sa class CwiseUnaryOp, Cwise::arg
+  */
+template<typename Scalar> struct scalar_arg_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); }
+  template<typename Packet>
+  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::parg(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_arg_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? 5 * NumTraits<Scalar>::MulCost : NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasArg
+  };
+};
 /** \internal
  * \brief Template functor to cast a scalar to another type
  *
@@ -182,7 +231,7 @@ struct functor_traits<scalar_imag_ref_op<Scalar> >
 template<typename Scalar> struct scalar_exp_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
 };
 template<typename Scalar>
@@ -198,13 +247,28 @@ struct functor_traits<scalar_exp_op<Scalar> >
 template<typename Scalar> struct scalar_log_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_log_op<Scalar> >
 { enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };

+/** \internal
+  *
+  * \brief Template functor to compute the base-10 logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log10()
+  */
+template<typename Scalar> struct scalar_log10_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log10; return log10(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const { return internal::plog10(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log10_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog10 }; };

 /** \internal
  * \brief Template functor to compute the square root of a scalar
@@ -213,7 +277,7 @@ struct functor_traits<scalar_log_op<Scalar> >
 template<typename Scalar> struct scalar_sqrt_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
 };
 template<typename Scalar>
@@ -224,6 +288,25 @@ struct functor_traits<scalar_sqrt_op<Scalar> >
  };
 };

+/** \internal
+  * \brief Template functor to compute the reciprocal square root of a scalar
+  * \sa class CwiseUnaryOp, Cwise::rsqrt()
+  */
+template<typename Scalar> struct scalar_rsqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return Scalar(1)/sqrt(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_rsqrt_op<Scalar> >
+{ enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRsqrt
+  };
+};
+
 /** \internal
  * \brief Template functor to compute the cosine of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::cos()
@@ -231,7 +314,7 @@ struct functor_traits<scalar_sqrt_op<Scalar> >
 template<typename Scalar> struct scalar_cos_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
 };
 template<typename Scalar>
@@ -250,7 +333,7 @@ struct functor_traits<scalar_cos_op<Scalar> >
 template<typename Scalar> struct scalar_sin_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
 };
 template<typename Scalar>
@@ -270,7 +353,7 @@ struct functor_traits<scalar_sin_op<Scalar> >
 template<typename Scalar> struct scalar_tan_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
 };
 template<typename Scalar>
@@ -289,7 +372,7 @@ struct functor_traits<scalar_tan_op<Scalar> >
 template<typename Scalar> struct scalar_acos_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
 };
 template<typename Scalar>
@@ -308,7 +391,7 @@ struct functor_traits<scalar_acos_op<Scalar> >
 template<typename Scalar> struct scalar_asin_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
 };
 template<typename Scalar>
@@ -320,7 +403,6 @@ struct functor_traits<scalar_asin_op<Scalar> >
  };
 };

-
 /** \internal
  * \brief Template functor to compute the atan of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::atan()
@@ -328,7 +410,7 @@ struct functor_traits<scalar_asin_op<Scalar> >
 template<typename Scalar> struct scalar_atan_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
  inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
+  template <typename Packet>
  inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
 };
 template<typename Scalar>
@@ -340,6 +422,63 @@ struct functor_traits<scalar_atan_op<Scalar> >
  };
 };

+/** \internal
+  * \brief Template functor to compute the tanh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::tanh()
+  */
+template<typename Scalar> struct scalar_tanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::tanh; return tanh(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_tanh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasTanh
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the sinh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sinh()
+  */
+template<typename Scalar> struct scalar_sinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::sinh; return sinh(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const { return internal::psinh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sinh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSinh
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the cosh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::cosh()
+  */
+template<typename Scalar> struct scalar_cosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::cosh; return cosh(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cosh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCosh
+  };
+};
+
 /** \internal
  * \brief Template functor to compute the inverse of a scalar
  * \sa class CwiseUnaryOp, Cwise::inverse()
@@ -388,6 +527,134 @@ template<typename Scalar>
 struct functor_traits<scalar_cube_op<Scalar> >
 { enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };

+/** \internal
+  * \brief Template functor to compute the rounded value of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::round()
+  */
+template<typename Scalar> struct scalar_round_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_round_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::round(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const { return internal::pround(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_round_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the floor of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::floor()
+  */
+template<typename Scalar> struct scalar_floor_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_floor_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::floor(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_floor_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasFloor
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the ceil of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::ceil()
+  */
+template<typename Scalar> struct scalar_ceil_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); }
+  template <typename Packet>
+  inline Packet packetOp(const Packet& a) const { return internal::pceil(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ceil_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCeil
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute whether a scalar is NaN
+  * \sa class CwiseUnaryOp, ArrayBase::isnan()
+  */
+template<typename Scalar> struct scalar_isnan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isnan_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to check whether a scalar is +/-inf
+  * \sa class CwiseUnaryOp, ArrayBase::isinf()
+  */
+template<typename Scalar> struct scalar_isinf_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isinf_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to check whether a scalar has a finite value
+  * \sa class CwiseUnaryOp, ArrayBase::isfinite()
+  */
+template<typename Scalar> struct scalar_isfinite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isfinite_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the logical not of a boolean
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::operator!
+  */
+template<typename Scalar> struct scalar_boolean_not_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_not_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; }
+};
+template<typename Scalar>
+struct functor_traits<scalar_boolean_not_op<Scalar> > {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+

 } // end namespace internal

--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -25,21 +25,31 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
  return a<=0 ? b : a;
 }

+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+#else
+const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+#endif
+
 /** \internal */
 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
  static bool m_cache_sizes_initialized = false;
-  static std::ptrdiff_t m_l1CacheSize = 32*1024;
-  static std::ptrdiff_t m_l2CacheSize = 256*1024;
-  static std::ptrdiff_t m_l3CacheSize = 2*1024*1024;
+  static std::ptrdiff_t m_l1CacheSize = 0;
+  static std::ptrdiff_t m_l2CacheSize = 0;
+  static std::ptrdiff_t m_l3CacheSize = 0;

  if(!m_cache_sizes_initialized)
  {
    int l1CacheSize, l2CacheSize, l3CacheSize;
    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
-    m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024);
-    m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024);
-    m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024);
+    m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+    m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+    m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
    m_cache_sizes_initialized = true;
  }

@@ -64,6 +74,211 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
  }
 }

+/* Helper for computeProductBlockingSizes.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depend on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+  // Explanations:
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within L1 cache.
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+  if (num_threads > 1) {
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      k_mask = -8,
+
+      mr = Traits::mr,
+      mr_mask = -mr,
+
+      nr = Traits::nr,
+      nr_mask = -nr
+    };
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
+    if (k_cache < k) {
+      k = k_cache & k_mask;
+      eigen_internal_assert(k > 0);
+    }
+
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+      n = n_cache & nr_mask;
+      eigen_internal_assert(n > 0);
+    } else {
+      n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
+    }
+
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+        m = m_cache & mr_mask;
+        eigen_internal_assert(m > 0);
+      } else {
+        m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 9*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
+#endif
+    
+    // Early return for small problems because the computations below are time consuming.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if((std::max)(k,(std::max)(m,n))<48)
+      return;
+    
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+    
+    // ---- 1st level of blocking on L1, yields kc ----
+    
+    // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
+    // We also include a register-level block of the result (mr x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+                        
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
+    
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+    
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative (it is better to underestimate the cache size than to overestimate it).
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+    
+    
+    
+    // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, then nc can grow arbitrarily. In practice, it seems to be fruitful
+    // to limit this growth: we bound nc to grow by a factor x1.5.
+    // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
+    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+    Index max_nc;
+    const Index lhs_bytes = m * k * sizeof(LhsScalar);
+    const Index remaining_l1 = l1- k_sub - lhs_bytes;
+    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
+    {
+      // L1 blocking
+      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
+    }
+    else
+    {
+      // L2 blocking
+      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    }
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fit in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fit in 1/3 of L2
+        actual_lm = l2;
+        max_mc = 576;
+      }
+      Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+      else if (mc==0) return;
+      m = (m%mc)==0 ? mc
+                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+    }
+  }
+}
+
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+    k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    return true;
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(k)
+  EIGEN_UNUSED_VARIABLE(m)
+  EIGEN_UNUSED_VARIABLE(n)
+#endif
+  return false;
+}
+
 /** \brief Computes the blocking parameters for a m x k times k x n matrix product
  *
  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
@@ -72,88 +287,34 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
  *
  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
  * this function computes the blocking size parameters along the respective dimensions
-  * for matrix products and related algorithms. The blocking sizes depends on various
-  * parameters:
-  * - the L1 and L2 cache sizes,
-  * - the register level blocking sizes defined by gebp_traits,
-  * - the number of scalars that fit into a packet (when vectorization is enabled).
+  * for matrix products and related algorithms.
+  *
+  * The blocking size parameters may be evaluated:
+  *   - either by a heuristic based on cache sizes;
+  *   - or using fixed prescribed values (for testing purposes).
  *
  * \sa setCpuCacheSizes */
-#define CEIL(a, b) ((a)+(b)-1)/(b)

-template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
-  // Explanations:
-  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
-  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
-  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
-  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
-  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
-  // stay in L1 cache.
-  std::ptrdiff_t l1, l2, l3;
-  manage_caching_sizes(GetAction, &l1, &l2, &l3);
-
-  if (num_threads > 1) {
-    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-    typedef typename Traits::ResScalar ResScalar;
-    enum {
-      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
-      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
-      k_mask = (0xffffffff/8)*8,
-
-      mr = Traits::mr,
-      mr_mask = (0xffffffff/mr)*mr,
-
-      nr = Traits::nr,
-      nr_mask = (0xffffffff/nr)*nr
-    };
-    SizeType k_cache = (l1-ksub)/kdiv;
-    if (k_cache < k) {
-      k = k_cache & k_mask;
-      eigen_assert(k > 0);
-    }
-
-    SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
-    SizeType n_per_thread = CEIL(n, num_threads);
-    if (n_cache <= n_per_thread) {
-      // Don't exceed the capacity of the l2 cache.
-      eigen_assert(n_cache >= static_cast<SizeType>(nr));
-      n = n_cache & nr_mask;
-      eigen_assert(n > 0);
-    } else {
-      n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
-    }
-
-    if (l3 > l2) {
-      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
-      SizeType m_per_thread = CEIL(m, num_threads);
-      if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
-        m = m_cache & mr_mask;
-        eigen_assert(m > 0);
-      } else {
-        m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
-      }
-    }
-  }
-  else {
-    // In unit tests we do not want to use extra large matrices,
-    // so we reduce the block size to check the blocking strategy is not flawed
-#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-    n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-    m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
-#else
-    k = std::min<SizeType>(k,24);
-    n = std::min<SizeType>(n,384/sizeof(RhsScalar));
-    m = std::min<SizeType>(m,384/sizeof(RhsScalar));
-#endif
+  if (!useSpecificBlockingSizes(k, m, n)) {
+    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
  }
+
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  enum {
+    kr = 8,
+    mr = Traits::mr,
+    nr = Traits::nr
+  };
+  if (k > kr) k -= k % kr;
+  if (m > mr) m -= m % mr;
+  if (n > nr) n -= n % nr;
 }

-template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
+template<typename LhsScalar, typename RhsScalar>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
 }
@@ -220,11 +381,14 @@ public:
    nr = 4,

    // register block size along the M direction (currently, this one cannot be modified)
+    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
-    mr = 3*LhsPacketSize,
+    // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
 #else
-    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+    mr = default_mr,
 #endif
    
    LhsProgress = LhsPacketSize,
@@ -698,6 +862,80 @@ protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
 };

+// helper for the rotating kernel below
+template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
+struct PossiblyRotatingKernelHelper
+{
+  // default implementation, not rotating
+
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
+  }
+
+  void unrotateResult(AccPacket&,
+                      AccPacket&,
+                      AccPacket&,
+                      AccPacket&)
+  {
+  }
+};
+
+// rotating implementation
+template <typename GebpKernel>
+struct PossiblyRotatingKernelHelper<GebpKernel, true>
+{
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    if (Index == 0) {
+      to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
+    } else {
+      EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
+      to = protate<1>(to);
+    }
+  }
+
+  void unrotateResult(AccPacket& res0,
+                      AccPacket& res1,
+                      AccPacket& res2,
+                      AccPacket& res3)
+  {
+    PacketBlock<AccPacket> resblock;
+    resblock.packet[0] = res0;
+    resblock.packet[1] = res1;
+    resblock.packet[2] = res2;
+    resblock.packet[3] = res3;
+    ptranspose(resblock);
+    resblock.packet[3] = protate<1>(resblock.packet[3]);
+    resblock.packet[2] = protate<2>(resblock.packet[2]);
+    resblock.packet[1] = protate<3>(resblock.packet[1]);
+    ptranspose(resblock);
+    res0 = resblock.packet[0];
+    res1 = resblock.packet[1];
+    res2 = resblock.packet[2];
+    res3 = resblock.packet[3];
+  }
+};
+
 /* optimized GEneral packed Block * packed Panel product kernel
 *
 * Mixing type logic: C += A * B
@@ -731,6 +969,16 @@ struct gebp_kernel
    ResPacketSize = Traits::ResPacketSize
  };

+
+  static const bool UseRotatingKernel =
+    EIGEN_ARCH_ARM &&
+    internal::is_same<LhsScalar, float>::value &&
+    internal::is_same<RhsScalar, float>::value &&
+    internal::is_same<ResScalar, float>::value &&
+    Traits::LhsPacketSize == 4 &&
+    Traits::RhsPacketSize == 4 &&
+    Traits::ResPacketSize == 4;
+
  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
@@ -758,22 +1006,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
    const Index peeled_kc  = depth & ~(pk-1);
    const Index prefetch_res_offset = 32/sizeof(ResScalar);    
 //     const Index depth2     = depth & ~1;
-    
+
    //---------- Process 3 * LhsProgress rows at once ----------
    // This corresponds to 3*LhsProgress x nr register blocks.
    // Usually, make sense only with FMA
    if(mr>=3*Traits::LhsProgress)
-    {
-      // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
-      for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
+    {      
+      PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
+      
+      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
+      // and on each largest micro vertical panel of the rhs (depth * nr).
+      // The blocking sizes, i.e., 'depth', have been computed so that the micro horizontal panel of the lhs fits in L1.
+      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
+      // This actual number of rows is computed as follows:
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
+      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
      {
-        // loops on each largest micro vertical panel of rhs (depth * nr)
+        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
        for(Index j2=0; j2<packet_cols4; j2+=nr)
        {
-          // We select a 3*Traits::LhsProgress x nr micro block of res which is entirely
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
+          
+          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
          // stored into 3 x nr registers.
          
-          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
@@ -798,50 +1060,50 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          prefetch(&blB[0]);
          LhsPacket A0, A1;
-          
+
          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
            RhsPacket B_0, T0;
            LhsPacket A2;

-#define EIGEN_GEBGP_ONESTEP(K) \
+#define EIGEN_GEBP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA+(3*K+16)*LhsProgress); \
-              internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
+              if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, T0); \
              traits.madd(A2, B_0, C8, B_0); \
-              traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
              traits.madd(A0, B_0, C1, T0); \
              traits.madd(A1, B_0, C5, T0); \
              traits.madd(A2, B_0, C9, B_0); \
-              traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
              traits.madd(A0, B_0, C2,  T0); \
              traits.madd(A1, B_0, C6,  T0); \
              traits.madd(A2, B_0, C10, B_0); \
-              traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
              traits.madd(A0, B_0, C3 , T0); \
              traits.madd(A1, B_0, C7,  T0); \
              traits.madd(A2, B_0, C11, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
            } while(false)
-        
-            internal::prefetch(blB + 4 * pk * sizeof(RhsScalar)); /* Bug 953 */
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
+
+            internal::prefetch(blB);
+            EIGEN_GEBP_ONESTEP(0);
+            EIGEN_GEBP_ONESTEP(1);
+            EIGEN_GEBP_ONESTEP(2);
+            EIGEN_GEBP_ONESTEP(3);
+            EIGEN_GEBP_ONESTEP(4);
+            EIGEN_GEBP_ONESTEP(5);
+            EIGEN_GEBP_ONESTEP(6);
+            EIGEN_GEBP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
@@ -853,12 +1115,17 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          {
            RhsPacket B_0, T0;
            LhsPacket A2;
-            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
-  #undef EIGEN_GEBGP_ONESTEP
-  
+
+#undef EIGEN_GEBP_ONESTEP
+
+          possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
+          possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
+          possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
+
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);

@@ -900,12 +1167,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
-          r3.storePacket(2 * Traits::ResPacketSize, R2);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
        }

        // Deal with remaining columns of the rhs
        for(Index j2=packet_cols4; j2<cols; j2++)
        {
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
          prefetch(&blA[0]);
@@ -976,7 +1246,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
-          r0.storePacket(2 * Traits::ResPacketSize, R2);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
        }
      }
    }
@@ -984,13 +1255,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
    //---------- Process 2 * LhsProgress rows at once ----------
    if(mr>=2*Traits::LhsProgress)
    {
-      // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth)
-      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*LhsProgress)
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
+
+      for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
      {
-        // loops on each largest micro vertical panel of rhs (depth * nr)
+        Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
        for(Index j2=0; j2<packet_cols4; j2+=nr)
        {
-          // We select a 2*Traits::LhsProgress x nr micro block of res which is entirely
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
+          
+          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
          // stored into 2 x nr registers.
          
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
@@ -1094,11 +1373,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
+          }
        }
-
+      
        // Deal with remaining columns of the rhs
        for(Index j2=packet_cols4; j2<cols; j2++)
        {
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);
@@ -1165,6 +1447,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          }
        }
      }
    }
@@ -1286,17 +1569,17 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga

          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
            RhsPacket B_0;
        
 #define EIGEN_GEBGP_ONESTEP(K) \
            do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");        \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");        \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
              traits.madd(A0, B_0, C0, B_0);                                    \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");          \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");          \
            } while(false);

            EIGEN_GEBGP_ONESTEP(0);
@@ -1311,7 +1594,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
            blB += pk*RhsProgress;
            blA += pk*1*Traits::LhsProgress;

-            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
          }

          // process remaining peeled loop
@@ -1783,19 +2066,19 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

      Index k=0;
-      if((PacketSize%4)==0) // TODO enbale vectorized transposition for PacketSize==2 ??
+      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
      {
        for(; k<peeled_k; k+=PacketSize) {
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
          kernel.packet[0] = dm0.loadPacket(k);
-          kernel.packet[1] = dm1.loadPacket(k);
-          kernel.packet[2] = dm2.loadPacket(k);
-          kernel.packet[3] = dm3.loadPacket(k);
+          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
+          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
+          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
          ptranspose(kernel);
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
-          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
-          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2]));
-          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3]));
+          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
+          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
+          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
          count+=4*PacketSize;
        }
      }
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -164,6 +164,8 @@ static void run(Index rows, Index cols, Index depth,

    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+    
+    const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;

    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
    for(Index i2=0; i2<rows; i2+=mc)
@@ -188,7 +190,8 @@ static void run(Index rows, Index cols, Index depth,
          // We pack the rhs's block into a sequential chunk of memory (L2 caching)
          // Note that this block will be read a very high number of times, which is equal to the number of
          // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
-          pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+          if((!pack_rhs_once) || i2==0)
+            pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
          
          // Everything is packed, we can now call the panel * block kernel:
          gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
@@ -201,15 +204,10 @@ static void run(Index rows, Index cols, Index depth,
 };

 /*********************************************************************************
-*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
+*  Specialization of generic_product_impl for "large" GEMM, i.e.,
 *  implementation of the high level wrapper to general_matrix_matrix_product
 **********************************************************************************/

-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
-{};
-
 template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
 struct gemm_functor
 {
@@ -217,8 +215,9 @@ struct gemm_functor
    : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
  {}

-  void initParallelSession() const
+  void initParallelSession(Index num_threads) const
  {
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
    m_blocking.allocateA();
  }

@@ -276,7 +275,7 @@ class level3_blocking
 };

 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
  : public level3_blocking<
      typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
      typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
@@ -294,19 +293,32 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
      SizeB = ActualCols * MaxDepth
    };

-    EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+    EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
+#else
+    EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+    EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+#endif

  public:

-    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
+    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
    {
      this->m_mc = ActualRows;
      this->m_nc = ActualCols;
      this->m_kc = MaxDepth;
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
      this->m_blockA = m_staticA;
      this->m_blockB = m_staticB;
+#else
+      this->m_blockA = reinterpret_cast<LhsScalar*>((std::size_t(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockB = reinterpret_cast<RhsScalar*>((std::size_t(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+#endif
    }
+    
+    void initParallel(Index, Index, Index, Index)
+    {}

    inline void allocateA() {}
    inline void allocateB() {}
@@ -331,7 +343,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M

  public:

-    gemm_blocking_space(Index rows, Index cols, Index depth, int num_threads, bool l3_blocking)
+    gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
    {
      this->m_mc = Transpose ? cols : rows;
      this->m_nc = Transpose ? rows : cols;
@@ -351,6 +363,19 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
      m_sizeA = this->m_mc * this->m_kc;
      m_sizeB = this->m_kc * this->m_nc;
    }
+    
+    void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+      
+      eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);      
+      Index m = this->m_mc;
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+    }

    void allocateA()
    {
@@ -437,6 +462,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
  {
    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+    if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
+      return;

    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_aligned(res,size);
+  Index alignedStart = internal::first_default_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

@@ -463,7 +463,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,R
  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
-    EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
+    // FIXME: what is the purpose of this EIGEN_ALIGN_MAX ??
+    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
@@ -572,7 +573,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,R
  {
    for (Index i=start; i<end; ++i)
    {
-      EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
+      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
      // process first unaligned result's coeffs
--- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
@@ -46,38 +46,37 @@ namespace internal {

 // gemv specialization

-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
-struct general_matrix_vector_product_gemv :
-  general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
+template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv;

 #define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
 template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
 static void run( \
  Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \
  Scalar* res, Index resIncr, Scalar alpha) \
 { \
  if (ConjugateLhs) { \
-    general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+    general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \
+      rows, cols, lhs, rhs, res, resIncr, alpha); \
  } else { \
    general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
  } \
 } \
 }; \
 template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \
 static void run( \
  Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \
  Scalar* res, Index resIncr, Scalar alpha) \
 { \
    general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
 } \
 }; \

--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -120,25 +120,28 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
    return func(0,rows, 0,cols);

  Eigen::initParallel();
-  func.initParallelSession();
+  func.initParallelSession(threads);

  if(transpose)
    std::swap(rows,cols);
-
-  Index blockCols = (cols / threads) & ~Index(0x3);
-  Index blockRows = (rows / threads);
-  blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
  
  ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
-
+  
  #pragma omp parallel num_threads(threads)
  {
    Index i = omp_get_thread_num();
+    // Note that the actual number of threads might be lower than the number of requested ones.
+    Index actual_threads = omp_get_num_threads();
+    
+    Index blockCols = (cols / actual_threads) & ~Index(0x3);
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+  
    Index r0 = i*blockRows;
-    Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;

    Index c0 = i*blockCols;
-    Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;

    info[i].lhs_start = r0;
    info[i].lhs_length = actualBlockRows;
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -94,7 +94,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd

    size_t starti = FirstTriangular ? 0 : j+2;
    size_t endi   = FirstTriangular ? j : size;
-    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
+    size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
    size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);

    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -257,6 +257,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
    Scalar* _res,        Index resStride,
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
  {
+    const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
    // strip zeros
    Index diagSize  = (std::min)(_cols,_depth);
    Index rows      = _rows;
@@ -274,7 +275,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction

    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);
+    std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar);

    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
@@ -311,7 +312,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
      Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;

      Scalar* geb = blockB+ts*ts;
-      geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
+      geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));

      pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);

--- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
@@ -122,7 +122,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
       Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
       MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
       MKL_INT aStride = aa_tmp.outerStride(); \
-       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
       rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
 \
@@ -236,7 +236,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
       Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
       MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
       MKL_INT aStride = aa_tmp.outerStride(); \
-       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
       rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
 \
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -117,8 +117,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
          {
            // TODO write a small kernel handling this (can be shared with trsv)
            Index i  = IsLower ? k2+k1+k : k2-k1-k-1;
-            Index s  = IsLower ? k2+k1 : i+1;
            Index rs = actualPanelWidth - k - 1; // remaining size
+            Index s  = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1)
+                                                 :  IsLower ? i+1 : i-rs;

            Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
            for (Index j=j2; j<j2+actual_cols; ++j)
@@ -135,7 +136,6 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
              }
              else
              {
-                Index s = IsLower ? i+1 : i-rs;
                Scalar b = (other(i,j) *= a);
                Scalar* r = &other(s,j);
                const Scalar* l = &tri(s,i);
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -166,7 +166,7 @@ class BlasLinearMapper {
    return ploadt<HalfPacket, AlignmentType>(m_data + i);
  }

-  EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+  EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
    pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
  }

@@ -214,7 +214,7 @@ class blas_data_mapper {
  }

  template<typename SubPacket>
-  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
+  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
  }

@@ -224,12 +224,13 @@ class blas_data_mapper {
  }

  const Index stride() const { return m_stride; }
+  const Scalar* data() const { return m_data; }

  Index firstAligned(Index size) const {
    if (size_t(m_data)%sizeof(Scalar)) {
      return -1;
    }
-    return internal::first_aligned(m_data, size);
+    return internal::first_default_aligned(m_data, size);
  }

  protected:
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -140,7 +140,7 @@ const unsigned int LvalueBit = 0x20;
  */
 const unsigned int DirectAccessBit = 0x40;

-/** \ingroup flags
+/** \deprecated \ingroup flags
  *
  * means the first coefficient packet is guaranteed to be aligned.
  * An expression cannot has the AlignedBit without the PacketAccessBit flag.
@@ -215,12 +215,31 @@ enum {
 };

 /** \ingroup enums
-  * Enum for indicating whether an object is aligned or not. */
+  * Enum for indicating whether a buffer is aligned or not. */
 enum { 
-  /** Object is not correctly aligned for vectorization. */
-  Unaligned=0, 
-  /** Object is aligned for vectorization. */
-  Aligned=1 
+  Unaligned=0,        /**< Data pointer has no specific alignment. */
+  Aligned8=8,         /**< Data pointer is aligned on a 8 bytes boundary. */
+  Aligned16=16,       /**< Data pointer is aligned on a 16 bytes boundary. */
+  Aligned32=32,       /**< Data pointer is aligned on a 32 bytes boundary. */
+  Aligned64=64,       /**< Data pointer is aligned on a 64 bytes boundary. */
+  Aligned128=128,     /**< Data pointer is aligned on a 128 bytes boundary. */
+  AlignedMask=255,
+  Aligned=16,         /**< \deprecated Synonym for Aligned16. */
+#if EIGEN_MAX_ALIGN_BYTES==128
+  AlignedMax = Aligned128
+#elif EIGEN_MAX_ALIGN_BYTES==64
+  AlignedMax = Aligned64
+#elif EIGEN_MAX_ALIGN_BYTES==32
+  AlignedMax = Aligned32
+#elif EIGEN_MAX_ALIGN_BYTES==16
+  AlignedMax = Aligned16
+#elif EIGEN_MAX_ALIGN_BYTES==8
+  AlignedMax = Aligned8
+#elif EIGEN_MAX_ALIGN_BYTES==0
+  AlignedMax = Unaligned
+#else
+#error Invalid value for EIGEN_MAX_ALIGN_BYTES
+#endif
 };

 /** \ingroup enums
@@ -452,8 +471,8 @@ namespace Architecture
 }

 /** \internal \ingroup enums
-  * Enum used as template parameter in GeneralProduct. */
-enum { DefaultProduct=0, CoeffBasedProductMode, LazyCoeffBasedProductMode, LazyProduct, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+  * Enum used as template parameter in Product and product evaluators. */
+enum { DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };

 /** \internal \ingroup enums
  * Enum used in experimental parallel implementation. */
@@ -468,6 +487,9 @@ struct Sparse {};
 /** The type used to identify a permutation storage. */
 struct PermutationStorage {};

+/** The type used to identify a transpositions storage. */
+struct TranspositionsStorage {};
+
 /** The type used to identify a matrix expression */
 struct MatrixXpr {};

@@ -482,6 +504,7 @@ struct BandShape              { static std::string debugName() { return "BandSha
 struct TriangularShape        { static std::string debugName() { return "TriangularShape"; } };
 struct SelfAdjointShape       { static std::string debugName() { return "SelfAdjointShape"; } };
 struct PermutationShape       { static std::string debugName() { return "PermutationShape"; } };
+struct TranspositionsShape    { static std::string debugName() { return "TranspositionsShape"; } };
 struct SparseShape            { static std::string debugName() { return "SparseShape"; } };

 namespace internal {
@@ -492,6 +515,16 @@ struct IndexBased {};
 // evaluator based on iterators to access coefficients. 
 struct IteratorBased {};

+/** \internal
+ * Constants for comparison functors
+ */
+enum ComparisonName {
+  cmp_EQ = 0,
+  cmp_LT = 1,
+  cmp_LE = 2,
+  cmp_UNORD = 3,
+  cmp_NEQ = 4
+};
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -91,8 +91,6 @@ template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
 template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
 template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
 template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
-template<typename BinOp,     typename Lhs, typename Rhs>  class SelfCwiseBinaryOp;      // TODO deprecated
-template<typename Derived,   typename Lhs, typename Rhs>  class ProductBase;            // TODO deprecated
 template<typename Decomposition, typename Rhstype>        class Solve;
 template<typename XprType>                                class Inverse;

@@ -102,9 +100,6 @@ namespace internal {

 template<typename Lhs, typename Rhs, int Option = DefaultProduct> class Product;
         
-template<typename Lhs, typename Rhs, int Mode>            class GeneralProduct;         // TODO deprecated
-template<typename Lhs, typename Rhs, int NestingFlags>    class CoeffBasedProduct;      // TODO deprecated
-
 template<typename Derived> class DiagonalBase;
 template<typename _DiagonalVectorType> class DiagonalWrapper;
 template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime=SizeAtCompileTime> class DiagonalMatrix;
@@ -152,6 +147,9 @@ template<typename _Scalar, int Rows=Dynamic, int Cols=Dynamic, int Supers=Dynami

 namespace internal {
 template<typename Lhs, typename Rhs> struct product_type;
+
+template<bool> struct EnableIf;
+
 /** \internal
  * \class product_evaluator
  * Products need their own evaluator with more template arguments allowing for
@@ -162,7 +160,8 @@ template< typename T,
          typename LhsShape = typename evaluator_traits<typename T::Lhs>::Shape,
          typename RhsShape = typename evaluator_traits<typename T::Rhs>::Shape,
          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
-          typename RhsScalar = typename traits<typename T::Rhs>::Scalar
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar,
+          typename = EnableIf<true> // extra template parameter for SFINAE-based specialization
        > struct product_evaluator;
 }

@@ -189,6 +188,7 @@ template<typename Scalar> struct scalar_imag_op;
 template<typename Scalar> struct scalar_abs_op;
 template<typename Scalar> struct scalar_abs2_op;
 template<typename Scalar> struct scalar_sqrt_op;
+template<typename Scalar> struct scalar_rsqrt_op;
 template<typename Scalar> struct scalar_exp_op;
 template<typename Scalar> struct scalar_log_op;
 template<typename Scalar> struct scalar_cos_op;
@@ -213,6 +213,7 @@ template<typename Scalar> struct scalar_identity_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
 template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+template<typename LhsScalar,typename RhsScalar> struct scalar_quotient2_op;

 } // end namespace internal

@@ -264,6 +265,7 @@ template<typename Derived> class QuaternionBase;
 template<typename Scalar> class Rotation2D;
 template<typename Scalar> class AngleAxis;
 template<typename Scalar,int Dim> class Translation;
+template<typename Scalar,int Dim> class AlignedBox;

 template<typename Scalar, int Options = AutoAlign> class Quaternion;
 template<typename Scalar,int Dim,int Mode,int _Options=AutoAlign> class Transform;
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -13,7 +13,7 @@

 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 90
+#define EIGEN_MINOR_VERSION 91

 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -101,7 +101,7 @@


 /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
-#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_CLANG || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM )
+#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM )
  #define EIGEN_COMP_GNUC_STRICT 1
 #else
  #define EIGEN_COMP_GNUC_STRICT 0
@@ -213,7 +213,8 @@
 #endif

 /// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android
-#if defined(__ANDROID__)
+// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain.
+#if defined(__ANDROID__) || defined(ANDROID)
  #define EIGEN_OS_ANDROID 1
 #else
  #define EIGEN_OS_ANDROID 0
@@ -282,6 +283,19 @@
  #define EIGEN_OS_WIN_STRICT 0
 #endif

+/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
+#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SUN 1
+#else
+  #define EIGEN_OS_SUN 0
+#endif
+
+/// \internal EIGEN_OS_SOLARIS set to 1 if the OS is Solaris
+#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SOLARIS 1
+#else
+  #define EIGEN_OS_SOLARIS 0
+#endif



@@ -292,65 +306,10 @@
  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
 #endif

-// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
-// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
-// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
-// certain common platform (compiler+architecture combinations) to avoid these problems.
-// Only static alignment is really problematic (relies on nonstandard compiler extensions that don't
-// work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even
-// when we have to disable static alignment.
-#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-#else
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
-#endif
-
-// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
-#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
- && !EIGEN_GCC3_OR_OLDER \
- && !EIGEN_COMP_SUNCC \
- && !EIGEN_OS_QNX
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
-#else
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
-#endif
-
-// Defined the boundary (in bytes) on which the data needs to be aligned. Note
-// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
-// aligned at all regardless of the value of this #define.
-#define EIGEN_ALIGN_BYTES 16
-
-#ifdef EIGEN_DONT_ALIGN
-  #ifndef EIGEN_DONT_ALIGN_STATICALLY
-    #define EIGEN_DONT_ALIGN_STATICALLY
-  #endif
-  #define EIGEN_ALIGN 0
-#elif !defined(EIGEN_DONT_VECTORIZE)
-  #if defined(__AVX__)
-    #undef EIGEN_ALIGN_BYTES
-    #define EIGEN_ALIGN_BYTES 32
-  #endif
-  #define EIGEN_ALIGN 1
-#else
-  #define EIGEN_ALIGN 0
-#endif
-
-
 // This macro can be used to prevent from macro expansion, e.g.:
 //   std::max EIGEN_NOT_A_MACRO(a,b)
 #define EIGEN_NOT_A_MACRO

-// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
-// alignment (EIGEN_DONT_ALIGN_STATICALLY) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STATICALLY should be used.
-#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STATICALLY)
-  #define EIGEN_ALIGN_STATICALLY 1
-#else
-  #define EIGEN_ALIGN_STATICALLY 0
-  #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-    #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-  #endif
-#endif
-
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
 #else
@@ -382,17 +341,44 @@
  #define EIGEN_HAVE_RVALUE_REFERENCES
 #endif

+// Does the compiler support result_of?
+#if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))
+#define EIGEN_HAS_STD_RESULT_OF 1
+#endif
+
 // Does the compiler support variadic templates?
 #if __cplusplus > 199711L
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
 #endif

 // Does the compiler support const expressions?
-#if (defined(__plusplus) && __cplusplus >= 201402L) || \
+#ifdef __CUDACC__
+  // Const expressions are not supported regardless of what host compiler is used 
+#elif (defined(__cplusplus) && __cplusplus >= 201402L) || \
    EIGEN_GNUC_AT_LEAST(4,9)
 #define EIGEN_HAS_CONSTEXPR 1
 #endif

+// Does the compiler support C++11 math?
+// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
+#ifndef EIGEN_HAS_CXX11_MATH
+  #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
+      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
+    #define EIGEN_HAS_CXX11_MATH 1
+  #else
+    #define EIGEN_HAS_CXX11_MATH 0
+  #endif
+#endif
+
+// Does the compiler support proper C++11 containers?
+#ifndef EIGEN_HAS_CXX11_CONTAINERS
+  #if ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG)) || EIGEN_COMP_MSVC >= 1900
+    #define EIGEN_HAS_CXX11_CONTAINERS 1
+  #else
+    #define EIGEN_HAS_CXX11_CONTAINERS 0
+  #endif
+#endif
+
 /** Allows to disable some optimizations which might affect the accuracy of the result.
  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
  * They currently include:
@@ -550,6 +536,20 @@ namespace Eigen {
  #endif
 #endif

+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+// 
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be aligned, respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+// 
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,64,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
 /* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
 * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
 * so that vectorization doesn't affect binary compatibility.
@@ -570,22 +570,124 @@ namespace Eigen {
  #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
 #endif

+// If the user explicitly disables vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+#elif defined(__AVX__)
+  // 32 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+  
+  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+  // certain common platform (compiler+architecture combinations) to avoid these problems.
+  // Only static alignment is really problematic (relies on nonstandard compiler extensions that don't
+  // work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even
+  // when we have to disable static alignment.
+  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #else
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #endif
+
+  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+  && !EIGEN_GCC3_OR_OLDER \
+  && !EIGEN_COMP_SUNCC \
+  && !EIGEN_OS_QNX
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+  #else
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #endif
+  
+  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+  #else
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+  #endif
+  
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
 #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
 #define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
-#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES)
-
-#if EIGEN_ALIGN_STATICALLY
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
-#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32
-#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
 #else
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16
-#define EIGEN_USER_ALIGN32
-#define EIGEN_USER_ALIGN_DEFAULT
+#define EIGEN_ALIGN_MAX
 #endif

+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+  #ifdef EIGEN_MAX_ALIGN_BYTES
+    #undef EIGEN_MAX_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+//----------------------------------------------------------------------
+
+
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
  #define EIGEN_RESTRICT
 #endif
@@ -611,7 +713,7 @@ namespace Eigen {
 // just an empty macro !
 #define EIGEN_EMPTY

-#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 // for older MSVC versions using the base operator is sufficient (cf Bug 1000)
  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
    using Base::operator =;
 #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -630,6 +732,11 @@ namespace Eigen {
    }
 #endif

+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ */
 #define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)

 /**
@@ -646,7 +753,7 @@ namespace Eigen {
  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
+  typedef typename Eigen::internal::ref_selector<Derived>::type Nested; \
  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
  typedef typename Eigen::internal::traits<Derived>::StorageIndex StorageIndex; \
  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
@@ -719,8 +826,13 @@ namespace Eigen {
 #  define EIGEN_TRY try
 #  define EIGEN_CATCH(X) catch (X)
 #else
-#  define EIGEN_THROW_X(X) std::abort()
-#  define EIGEN_THROW std::abort()
+#  ifdef __CUDA_ARCH__ // CUDA device pass: emit a trap instead of calling std::abort()
+#    define EIGEN_THROW_X(X) asm("trap;"); return {}
+#    define EIGEN_THROW asm("trap;"); return {}
+#  else
+#    define EIGEN_THROW_X(X) std::abort()
+#    define EIGEN_THROW std::abort()
+#  endif
 #  define EIGEN_TRY if (true)
 #  define EIGEN_CATCH(X) else
 #endif
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com>
 // Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
@@ -32,7 +32,7 @@
 // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
 // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
 #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
- && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16)
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
 #else
  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@@ -42,14 +42,14 @@
 //   See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
 // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
 //   See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
-#if defined(__FreeBSD__) && !(EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) && (EIGEN_ALIGN_BYTES == 16)
+#if defined(__FreeBSD__) && !(EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
 #else
  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
 #endif

-#if (EIGEN_OS_MAC && (EIGEN_ALIGN_BYTES == 16))     \
- || (EIGEN_OS_WIN64 && (EIGEN_ALIGN_BYTES == 16))   \
+#if (EIGEN_OS_MAC && (EIGEN_DEFAULT_ALIGN_BYTES == 16))     \
+ || (EIGEN_OS_WIN64 && (EIGEN_DEFAULT_ALIGN_BYTES == 16))   \
 || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED              \
 || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
  #define EIGEN_MALLOC_ALREADY_ALIGNED 1
@@ -59,18 +59,20 @@

 #endif

-// See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
-// It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
-// Currently, let's include it only on unix systems:
-#if EIGEN_OS_UNIX
-  #include <unistd.h>
-  #if (EIGEN_OS_QNX || (defined _GNU_SOURCE) || EIGEN_COMP_PGI || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
-    #define EIGEN_HAS_POSIX_MEMALIGN 1
-  #endif
-#endif
-
 #ifndef EIGEN_HAS_POSIX_MEMALIGN
-  #define EIGEN_HAS_POSIX_MEMALIGN 0
+  // See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
+  // It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
+  // Currently, let's include it only on unix systems:
+  #if EIGEN_OS_UNIX && !(EIGEN_OS_SUN || EIGEN_OS_SOLARIS)
+    #include <unistd.h>
+    #if (EIGEN_OS_QNX || (defined _GNU_SOURCE) || EIGEN_COMP_PGI || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
+      #define EIGEN_HAS_POSIX_MEMALIGN 1
+    #endif
+  #endif
+
+  #ifndef EIGEN_HAS_POSIX_MEMALIGN
+    #define EIGEN_HAS_POSIX_MEMALIGN 0
+  #endif
 #endif

 #if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
@@ -105,9 +107,9 @@ inline void throw_std_bad_alloc()
  */
 inline void* handmade_aligned_malloc(std::size_t size)
 {
-  void *original = std::malloc(size+EIGEN_ALIGN_BYTES);
+  void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES);
  if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
  *(reinterpret_cast<void**>(aligned) - 1) = original;
  return aligned;
 }
@@ -128,9 +130,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
  if (ptr == 0) return handmade_aligned_malloc(size);
  void *original = *(reinterpret_cast<void**>(ptr) - 1);
  std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
-  original = std::realloc(original,size+EIGEN_ALIGN_BYTES);
+  original = std::realloc(original,size+EIGEN_DEFAULT_ALIGN_BYTES);
  if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
  void *previous_aligned = static_cast<char *>(original)+previous_offset;
  if(aligned!=previous_aligned)
    std::memmove(aligned, previous_aligned, size);
@@ -216,16 +218,16 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
  check_that_malloc_is_allowed();

  void *result;
-  #if !EIGEN_ALIGN
+  #if EIGEN_DEFAULT_ALIGN_BYTES==0
    result = std::malloc(size);
  #elif EIGEN_MALLOC_ALREADY_ALIGNED
    result = std::malloc(size);
  #elif EIGEN_HAS_POSIX_MEMALIGN
-    if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0;
+    if(posix_memalign(&result, EIGEN_DEFAULT_ALIGN_BYTES, size)) result = 0;
  #elif EIGEN_HAS_MM_MALLOC
-    result = _mm_malloc(size, EIGEN_ALIGN_BYTES);
+    result = _mm_malloc(size, EIGEN_DEFAULT_ALIGN_BYTES);
  #elif EIGEN_OS_WIN_STRICT
-    result = _aligned_malloc(size, EIGEN_ALIGN_BYTES);
+    result = _aligned_malloc(size, EIGEN_DEFAULT_ALIGN_BYTES);
  #else
    result = handmade_aligned_malloc(size);
  #endif
@@ -239,7 +241,7 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
 /** \internal Frees memory allocated with aligned_malloc. */
 EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
-  #if !EIGEN_ALIGN
+  #if EIGEN_DEFAULT_ALIGN_BYTES==0
    std::free(ptr);
  #elif EIGEN_MALLOC_ALREADY_ALIGNED
    std::free(ptr);
@@ -264,7 +266,7 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
  EIGEN_UNUSED_VARIABLE(old_size);

  void *result;
-#if !EIGEN_ALIGN
+#if EIGEN_DEFAULT_ALIGN_BYTES==0
  result = std::realloc(ptr,new_size);
 #elif EIGEN_MALLOC_ALREADY_ALIGNED
  result = std::realloc(ptr,new_size);
@@ -275,12 +277,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
  // implements _mm_malloc/_mm_free based on the corresponding _aligned_
  // functions. This may not always be the case and we just try to be safe.
  #if EIGEN_OS_WIN_STRICT && defined(_mm_free)
-    result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
+    result = _aligned_realloc(ptr,new_size,EIGEN_DEFAULT_ALIGN_BYTES);
  #else
    result = generic_aligned_realloc(ptr,new_size,old_size);
  #endif
 #elif EIGEN_OS_WIN_STRICT
-  result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
+  result = _aligned_realloc(ptr,new_size,EIGEN_DEFAULT_ALIGN_BYTES);
 #else
  result = handmade_aligned_realloc(ptr,new_size,old_size);
 #endif
@@ -504,47 +506,57 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align

 /****************************************************************************/

-/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
  *
+  * \tparam Alignment requested alignment in Bytes.
  * \param array the address of the start of the array
  * \param size the size of the array
  *
-  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
-  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+  * \note If no element of the array is well aligned or the requested alignment is not a multiple of the scalar size,
+  * the size of the array is returned. For example with SSE, the requested alignment is typically 16 bytes. If
  * packet size for the given scalar type is 1, then everything is considered well-aligned.
  *
-  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
-  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
-  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * \note Otherwise, if the Alignment is larger than the scalar size, we rely on the assumption that sizeof(Scalar) is a
+  * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
  * example with Scalar=double on certain 32-bit platforms, see bug #79.
  *
  * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+  * \sa first_default_aligned()
  */
-template<typename Scalar, typename Index>
+template<int Alignment, typename Scalar, typename Index>
 inline Index first_aligned(const Scalar* array, Index size)
 {
-  static const Index PacketSize = packet_traits<Scalar>::size;
-  static const Index PacketAlignedMask = PacketSize-1;
+  static const Index ScalarSize = sizeof(Scalar);
+  static const Index AlignmentSize = Alignment / ScalarSize;
+  static const Index AlignmentMask = AlignmentSize-1;

-  if(PacketSize==1)
+  if(AlignmentSize<=1)
  {
-    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
-    // of the array have the same alignment.
+    // Either the requested alignment is smaller than a scalar, or it exactly matches the size of one scalar
+    // so that all elements of the array have the same alignment.
    return 0;
  }
-  else if(size_t(array) & (sizeof(Scalar)-1))
+  else if( (std::size_t(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
  {
-    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+    // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
    // Consequently, no element of the array is well aligned.
    return size;
  }
  else
  {
-    return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
-                           & PacketAlignedMask, size);
+    return std::min<Index>( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size);
  }
 }

+/** \internal Returns the index of the first element of the array that is well aligned with respect to the largest packet requirement.
+   * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
+template<typename Scalar, typename Index>
+inline Index first_default_aligned(const Scalar* array, Index size)
+{
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size);
+}
+
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
  */ 
 template<typename Index> 
@@ -557,18 +569,18 @@ inline Index first_multiple(Index size, Index base)
 // use memcpy on trivial types, i.e., on types that does not require an initialization ctor.
 template<typename T, bool UseMemcpy> struct smart_copy_helper;

-template<typename T> void smart_copy(const T* start, const T* end, T* target)
+template<typename T> EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T* end, T* target)
 {
  smart_copy_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
 }

 template<typename T> struct smart_copy_helper<T,true> {
-  static inline void run(const T* start, const T* end, T* target)
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
  { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
 };

 template<typename T> struct smart_copy_helper<T,false> {
-  static inline void run(const T* start, const T* end, T* target)
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
  { std::copy(start, end, target); }
 };

@@ -687,9 +699,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
  * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
  */
 #ifdef EIGEN_ALLOCA
-  // We always manually re-align the result of EIGEN_ALLOCA.
-  // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
-  #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES-1)) + EIGEN_ALIGN_BYTES-1) & ~(size_t(EIGEN_ALIGN_BYTES-1)))
+  
+  #if EIGEN_DEFAULT_ALIGN_BYTES>0
+    // We always manually re-align the result of EIGEN_ALLOCA.
+    // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<std::size_t>(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)))
+  #else
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
+  #endif

  #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
    Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
@@ -713,7 +730,7 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
 *****************************************************************************/

-#if EIGEN_ALIGN
+#if EIGEN_MAX_ALIGN_BYTES!=0
  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
      void* operator new(size_t size, const std::nothrow_t&) throw() { \
        EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
@@ -749,7 +766,7 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)

 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0)))
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0)))

 /****************************************************************************/

--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -117,6 +117,10 @@ template<typename T> struct enable_if<true,T>
 { typedef T type; };

 #if defined(__CUDA_ARCH__)
+#if !defined(__FLT_EPSILON__)
+#define __FLT_EPSILON__ FLT_EPSILON
+#define __DBL_EPSILON__ DBL_EPSILON
+#endif

 namespace device {

@@ -124,16 +128,53 @@ template<typename T> struct numeric_limits
 {
  EIGEN_DEVICE_FUNC
  static T epsilon() { return 0; }
+  static T (max)() { assert(false && "Highest not supported for this type"); }
+  static T (min)() { assert(false && "Lowest not supported for this type"); }
 };
 template<> struct numeric_limits<float>
 {
   EIGEN_DEVICE_FUNC
   static float epsilon() { return __FLT_EPSILON__; }
+  EIGEN_DEVICE_FUNC
+  static float (max)() { return CUDART_MAX_NORMAL_F; } // largest finite float
+  EIGEN_DEVICE_FUNC
+  static float (min)() { return FLT_MIN; } // smallest positive normalized float (not epsilon), as std::numeric_limits<float>::min()
 };
 template<> struct numeric_limits<double>
 {
   EIGEN_DEVICE_FUNC
   static double epsilon() { return __DBL_EPSILON__; }
+  EIGEN_DEVICE_FUNC
+  static double (max)() { return DBL_MAX; } // largest finite double; infinity is not a valid maximum
+  EIGEN_DEVICE_FUNC
+  static double (min)() { return DBL_MIN; } // smallest positive normalized double (not epsilon)
+};
+template<> struct numeric_limits<int>
+{
+  EIGEN_DEVICE_FUNC
+  static int epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static int (max)() { return INT_MAX; }
+  EIGEN_DEVICE_FUNC
+  static int (min)() { return INT_MIN; }
+};
+template<> struct numeric_limits<long>
+{
+  EIGEN_DEVICE_FUNC
+  static long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static long (max)() { return LONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static long (min)() { return LONG_MIN; }
+};
+template<> struct numeric_limits<long long>
+{
+  EIGEN_DEVICE_FUNC
+  static long long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static long long (max)() { return LLONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static long long (min)() { return LLONG_MIN; }
 };

 }
@@ -145,11 +186,11 @@ template<> struct numeric_limits<double>
  */
 class noncopyable
 {
-  noncopyable(const noncopyable&);
-  const noncopyable& operator=(const noncopyable&);
+  EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
+  EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
 protected:
-  noncopyable() {}
-  ~noncopyable() {}
+  EIGEN_DEVICE_FUNC noncopyable() {}
+  EIGEN_DEVICE_FUNC ~noncopyable() {}
 };


@@ -160,7 +201,13 @@ protected:
  * upcoming next STL generation (using a templated result member).
  * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
  */
-template<typename T> struct result_of {};
+#ifdef EIGEN_HAS_STD_RESULT_OF
+template<typename T> struct result_of {
+  typedef typename std::result_of<T>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#else
+template<typename T> struct result_of { };

 struct has_none {int a[1];};
 struct has_std_result_type {int a[2];};
@@ -178,10 +225,10 @@ struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef ty
 template<typename Func, typename ArgType>
 struct result_of<Func(ArgType)> {
    template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
    template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
+    static has_none               testFunctor(...);

    // note that the following indirection is needed for gcc-3.3
    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
@@ -202,15 +249,16 @@ struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
 template<typename Func, typename ArgType0, typename ArgType1>
 struct result_of<Func(ArgType0,ArgType1)> {
    template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
    template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
+    static has_none               testFunctor(...);

    // note that the following indirection is needed for gcc-3.3
    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
    typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
 };
+#endif

 /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
  * Usage example: \code meta_sqrt<1023>::ret \endcode
@@ -284,6 +332,14 @@ template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b =
 template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
 #endif

+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+T div_ceil(const T &a, const T &b)
+{
+  return (a+b-1) / b;
+}
+
 } // end namespace numext

 } // end namespace Eigen
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -119,7 +119,72 @@ template<typename T> struct unpacket_traits
 {
  typedef T type;
  typedef T half;
-  enum {size=1};
+  enum
+  {
+    size = 1,
+    alignment = 1
+  };
+};
+
+template<int Size, typename PacketType,
+         bool Stop = Size==Dynamic || (Size%unpacket_traits<PacketType>::size)==0 || is_same<PacketType,typename unpacket_traits<PacketType>::half>::value>
+struct find_best_packet_helper;
+
+template< int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,true>
+{
+  typedef PacketType type;
+};
+
+template<int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,false>
+{
+  typedef typename find_best_packet_helper<Size,typename unpacket_traits<PacketType>::half>::type type;
+};
+
+template<typename T, int Size>
+struct find_best_packet
+{
+  typedef typename find_best_packet_helper<Size,typename packet_traits<T>::type>::type type;
+};
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+template<int ArrayBytes, int AlignmentBytes,
+         bool Match     =  bool((ArrayBytes%AlignmentBytes)==0),
+         bool TryHalf   =  bool(AlignmentBytes>EIGEN_MIN_ALIGN_BYTES) >
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+
+template<int ArrayBytes, int AlignmentBytes, bool TryHalf>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, true, TryHalf> // Match
+{
+  enum { value = AlignmentBytes };
+};
+
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, false, true> // Try-half
+{
+  // current packet too large, try with a half-packet
+  enum { value = compute_default_alignment_helper<ArrayBytes, AlignmentBytes/2>::value };
+};
+#else
+// If static alignment is disabled, no need to bother.
+// This also avoids a division by zero in "bool Match =  bool((ArrayBytes%AlignmentBytes)==0)"
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+#endif
+
+template<typename T, int Size> struct compute_default_alignment {
+  enum { value = compute_default_alignment_helper<Size*sizeof(T),EIGEN_MAX_STATIC_ALIGN_BYTES>::value };
+};
+
+template<typename T> struct compute_default_alignment<T,Dynamic> {
+  enum { value = EIGEN_MAX_ALIGN_BYTES };
 };

 template<typename _Scalar, int _Rows, int _Cols,
@@ -153,40 +218,6 @@ class compute_matrix_flags
    enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit };
 };

-template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-class compute_matrix_evaluator_flags
-{
-    enum {
-      row_major_bit = Options&RowMajor ? RowMajorBit : 0,
-      is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
-
-      aligned_bit =
-      (
-            ((Options&DontAlign)==0)
-        && (
-#if EIGEN_ALIGN_STATICALLY
-             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0))
-#else
-             0
-#endif
-
-          ||
-
-#if EIGEN_ALIGN
-             is_dynamic_size_storage
-#else
-             0
-#endif
-
-          )
-      ) ? AlignedBit : 0,
-      packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
-    };
-
-  public:
-    enum { ret = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit | aligned_bit };
-};
-
 template<int _Rows, int _Cols> struct size_at_compile_time
 {
  enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
@@ -309,9 +340,6 @@ template<typename T> struct plain_matrix_type_row_major
          > type;
 };

-// we should be able to get rid of this one too
-template<typename T> struct must_nest_by_value { enum { ret = false }; };
-
 /** \internal The reference selector for template expressions. The idea is that we don't
  * need to use references for expressions since they are light weight proxy
  * objects which should generate no copying overhead. */
@@ -323,6 +351,12 @@ struct ref_selector
    T const&,
    const T
  >::type type;
+  
+  typedef typename conditional<
+    bool(traits<T>::Flags & NestByRefBit),
+    T &,
+    T
+  >::type non_const_type;
 };

 /** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */
@@ -337,13 +371,6 @@ struct transfer_constness
 };


-// When using evaluators, we never evaluate when assembling the expression!!
-// TODO: get rid of this nested class since it's just an alias for ref_selector.
-template<typename T, int n=1, typename PlainObject = void> struct nested
-{
-  typedef typename ref_selector<T>::type type;
-};
-
 // However, we still need a mechanism to detect whether an expression which is evaluated multiple time
 // has to be evaluated into a temporary.
 // That's the purpose of this new nested_eval helper:
@@ -428,7 +455,9 @@ struct special_scalar_op_base : public DenseCoeffsBase<Derived>
 {
  // dummy operator* so that the
  // "using special_scalar_op_base::operator*" compiles
-  void operator*() const;
+  struct dummy {};
+  void operator*(dummy) const;
+  void operator/(dummy) const;
 };

 template<typename Derived,typename Scalar,typename OtherScalar>
@@ -452,6 +481,16 @@ struct special_scalar_op_base<Derived,Scalar,OtherScalar,true>  : public DenseCo
 #endif
    return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar);
  }
+  
+  const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
+  operator/(const OtherScalar& scalar) const
+  {
+#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+    EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+#endif
+    return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
+      (*static_cast<const Derived*>(this), scalar_quotient2_op<Scalar,OtherScalar>(scalar));
+  }
 };

 template<typename XprType, typename CastType> struct cast_return_type
@@ -603,6 +642,18 @@ template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
 template<typename S1, typename S2> struct glue_shapes;
 template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type;  };

+template<typename T1, typename T2>
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+{
+  return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
+}
+
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+{
+  return false;
+}
+
 } // end namespace internal

 // we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
--- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -234,6 +234,12 @@ template<typename _MatrixType> class ComplexEigenSolver
    }

  protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
    EigenvectorType m_eivec;
    EigenvalueType m_eivalues;
    ComplexSchur<MatrixType> m_schur;
@@ -251,6 +257,8 @@ template<typename MatrixType>
 ComplexEigenSolver<MatrixType>& 
 ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
  // this code is inspired from Jampack
  eigen_assert(matrix.cols() == matrix.rows());

--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@@ -299,6 +299,13 @@ template<typename _MatrixType> class EigenSolver
    void doComputeEigenvectors();

  protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+    }
+    
    MatrixType m_eivec;
    EigenvalueType m_eivalues;
    bool m_isInitialized;
@@ -366,6 +373,8 @@ template<typename MatrixType>
 EigenSolver<MatrixType>& 
 EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
  using std::sqrt;
  using std::abs;
  using numext::isfinite;
@@ -390,7 +399,7 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
      if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) 
      {
        m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
-        if(!isfinite(m_eivalues.coeffRef(i)))
+        if(!(isfinite)(m_eivalues.coeffRef(i)))
        {
          m_isInitialized = true;
          m_eigenvectorsOk = false;
@@ -408,7 +417,7 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
        {
          Scalar t0 = m_matT.coeff(i+1, i);
          Scalar t1 = m_matT.coeff(i, i+1);
-          Scalar maxval = numext::maxi(abs(p),numext::maxi(abs(t0),abs(t1)));
+          Scalar maxval = numext::maxi<Scalar>(abs(p),numext::maxi<Scalar>(abs(t0),abs(t1)));
          t0 /= maxval;
          t1 /= maxval;
          Scalar p0 = p/maxval;
@@ -417,7 +426,7 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
        
        m_eivalues.coeffRef(i)   = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
        m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
-        if(!(isfinite(m_eivalues.coeffRef(i)) && isfinite(m_eivalues.coeffRef(i+1))))
+        if(!((isfinite)(m_eivalues.coeffRef(i)) && (isfinite)(m_eivalues.coeffRef(i+1))))
        {
          m_isInitialized = true;
          m_eigenvectorsOk = false;
@@ -475,7 +484,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
  }
  
  // Backsubstitute to find vectors of upper triangular form
-  if (norm == 0.0)
+  if (norm == Scalar(0))
  {
    return;
  }
@@ -497,7 +506,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
        Scalar w = m_matT.coeff(i,i) - p;
        Scalar r = m_matT.row(i).segment(l,n-l+1).dot(m_matT.col(n).segment(l, n-l+1));

-        if (m_eivalues.coeff(i).imag() < 0.0)
+        if (m_eivalues.coeff(i).imag() < Scalar(0))
        {
          lastw = w;
          lastr = r;
@@ -505,9 +514,9 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
        else
        {
          l = i;
-          if (m_eivalues.coeff(i).imag() == 0.0)
+          if (m_eivalues.coeff(i).imag() == Scalar(0))
          {
-            if (w != 0.0)
+            if (w != Scalar(0))
              m_matT.coeffRef(i,n) = -r / w;
            else
              m_matT.coeffRef(i,n) = -r / (eps * norm);
@@ -545,19 +554,19 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
      }
      else
      {
-        std::complex<Scalar> cc = cdiv<Scalar>(0.0,-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
+        std::complex<Scalar> cc = cdiv<Scalar>(Scalar(0),-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
        m_matT.coeffRef(n-1,n-1) = numext::real(cc);
        m_matT.coeffRef(n-1,n) = numext::imag(cc);
      }
-      m_matT.coeffRef(n,n-1) = 0.0;
-      m_matT.coeffRef(n,n) = 1.0;
+      m_matT.coeffRef(n,n-1) = Scalar(0);
+      m_matT.coeffRef(n,n) = Scalar(1);
      for (Index i = n-2; i >= 0; i--)
      {
        Scalar ra = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n-1).segment(l, n-l+1));
        Scalar sa = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
        Scalar w = m_matT.coeff(i,i) - p;

-        if (m_eivalues.coeff(i).imag() < 0.0)
+        if (m_eivalues.coeff(i).imag() < Scalar(0))
        {
          lastw = w;
          lastra = ra;
@@ -579,7 +588,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
            Scalar y = m_matT.coeff(i+1,i);
            Scalar vr = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) + m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag() - q * q;
            Scalar vi = (m_eivalues.coeff(i).real() - p) * Scalar(2) * q;
-            if ((vr == 0.0) && (vi == 0.0))
+            if ((vr == Scalar(0)) && (vi == Scalar(0)))
              vr = eps * norm * (abs(w) + abs(q) + abs(x) + abs(y) + abs(lastw));

            std::complex<Scalar> cc = cdiv(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra,vr,vi);
@@ -599,7 +608,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
          }

          // Overflow control
-          Scalar t = numext::maxi(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
+          Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
          if ((eps * t) * t > Scalar(1))
            m_matT.block(i, n-1, size-i, 2) /= t;

--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -263,6 +263,13 @@ template<typename _MatrixType> class GeneralizedEigenSolver
    }

  protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+    }
+    
    MatrixType m_eivec;
    ComplexVectorType m_alphas;
    VectorType m_betas;
@@ -290,6 +297,8 @@ template<typename MatrixType>
 GeneralizedEigenSolver<MatrixType>&
 GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
  using std::sqrt;
  using std::abs;
  eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
--- a/Show More
+++ b/Show More