Bump to 3.3.2

Defer set-to-zero in triangular = product so that no aliasing issue occur in the common:
A.triangularView() = B*A.sefladjointView()*B.adjoint() case that used to work in 3.2. (grafted from 655ba783f8 )
2026-04-10 11:34:33 +08:00 · 2017-01-18 15:06:40 +01:00 · 2017-01-17 18:03:35 +01:00 · 2017-01-16 13:36:56 +01:00 · 2017-01-06 18:01:29 +01:00 · 2017-01-06 15:44:13 +01:00
112 changed files with 3330 additions and 1628 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -380,7 +380,7 @@ else()
      )
 endif()
 set(CMAKEPACKAGE_INSTALL_DIR
-    "${CMAKE_INSTALL_LIBDIR}/cmake/eigen3"
+    "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
    )
 set(PKGCONFIG_INSTALL_DIR
@@ -507,18 +507,89 @@ set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
 set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
 set ( EIGEN_DEFINITIONS "")
 set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
-set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )

-configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
-                 ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-                 @ONLY ESCAPE_QUOTES
-               )
+# Interface libraries require at least CMake 3.0
+if (NOT CMAKE_VERSION VERSION_LESS 3.0)
+  include (CMakePackageConfigHelpers)

-install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
-                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
-        )
+  # Imported target support
+  add_library (eigen INTERFACE)
+
+  target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
+  target_include_directories (eigen INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
+  )
+
+  # Export as title case Eigen
+  set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
+
+  install (TARGETS eigen EXPORT Eigen3Targets)
+
+  configure_package_config_file (
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+    PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+    INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+    NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+  )
+  # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
+  # not depend on architecture specific settings or libraries. More
+  # specifically, an Eigen3Config.cmake generated from a 64 bit target can be
+  # used for 32 bit targets as well (and vice versa).
+  set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+  unset (CMAKE_SIZEOF_VOID_P)
+  write_basic_package_version_file (Eigen3ConfigVersion.cmake
+    VERSION ${EIGEN_VERSION_NUMBER} COMPATIBILITY SameMajorVersion)
+  set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
+
+  # The Eigen target will be located in the Eigen3 namespace. Other CMake
+  # targets can refer to it using Eigen3::Eigen.
+  export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
+  # Export Eigen3 package to CMake registry such that it can be easily found by
+  # CMake even if it has not been installed to a standard directory.
+  export (PACKAGE Eigen3)
+
+  install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION
+    ${CMAKEPACKAGE_INSTALL_DIR})
+  install (FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
+    DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
+else (NOT CMAKE_VERSION VERSION_LESS 3.0)
+  # Fallback to legacy Eigen3Config.cmake without the imported target
+  
+  # If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
+  # create a relocatable Config file, otherwise leave the hardcoded paths       
+  include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
+  
+  if(CPCH_PATH)
+    configure_package_config_file (
+      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
+      ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+      PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+      INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+      NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+    )
+  else() 
+    # The PACKAGE_* variables are defined by the configure_package_config_file
+    # but without it we define them manually to the hardcoded paths
+    set(PACKAGE_INIT "")
+    set(PACKAGE_EIGEN_INCLUDE_DIR ${EIGEN_INCLUDE_DIR})
+    set(PACKAGE_EIGEN_ROOT_DIR ${EIGEN_ROOT_DIR})
+    configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
+                     ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+                     @ONLY ESCAPE_QUOTES
+                   )
+  endif()
+
+  install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
+                  ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+            DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+          )
+endif (NOT CMAKE_VERSION VERSION_LESS 3.0)

 # Add uninstall target
 add_custom_target ( uninstall
--- a/CTestConfig.cmake
+++ b/CTestConfig.cmake
@@ -4,14 +4,10 @@
 ## # The following are required to uses Dart and the Cdash dashboard
 ##   ENABLE_TESTING()
 ##   INCLUDE(CTest)
-set(CTEST_PROJECT_NAME "Eigen")
+set(CTEST_PROJECT_NAME "Eigen3.3")
 set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")

 set(CTEST_DROP_METHOD "http")
 set(CTEST_DROP_SITE "manao.inria.fr")
-set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
+set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen3.3")
 set(CTEST_DROP_SITE_CDASH TRUE)
-set(CTEST_PROJECT_SUBPROJECTS
-Official
-Unsupported
-)
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,16 +14,6 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"

-/// This will no longer be needed after the next release of the computecppCE
-#ifdef EIGEN_USE_SYCL
-#undef min
-#undef max
-#undef isnan
-#undef isinf
-#undef isfinite
-#include <SYCL/sycl.hpp>
-#endif
-
 // Handle NVCC/CUDA/SYCL
 #if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
  // Do not try asserts on CUDA and SYCL!
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -351,7 +351,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
      Index ret;
      if((ret=unblocked(A11))>=0) return k+ret;
      if(rs>0) A11.adjoint().template triangularView<Upper>().template solveInPlace<OnTheRight>(A21);
-      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,-1); // bottleneck
+      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,typename NumTraits<RealScalar>::Literal(-1)); // bottleneck
    }
    return -1;
  }
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -55,7 +55,7 @@ template<> struct cholmod_configure_matrix<std::complex<double> > {
  * Note that the data are shared.
  */
 template<typename _Scalar, int _Options, typename _StorageIndex>
-cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
+cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> > mat)
 {
  cholmod_sparse res;
  res.nzmax   = mat.nonZeros();
@@ -104,7 +104,14 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
 template<typename _Scalar, int _Options, typename _Index>
 const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>& mat)
 {
-  cholmod_sparse res = viewAsCholmod(mat.const_cast_derived());
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
+  return res;
+}
+
+template<typename _Scalar, int _Options, typename _Index>
+const cholmod_sparse viewAsCholmod(const SparseVector<_Scalar,_Options,_Index>& mat)
+{
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
  return res;
 }

@@ -113,7 +120,7 @@ const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>&
 template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
 cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
-  cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
  
  if(UpLo==Upper) res.stype =  1;
  if(UpLo==Lower) res.stype = -1;
@@ -298,8 +305,8 @@ class CholmodBase : public SparseSolverBase<Derived>
    }
    
    /** \internal */
-    template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
-    void _solve_impl(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+    template<typename RhsDerived, typename DestDerived>
+    void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
    {
      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
      const Index size = m_cholmodFactor->n;
@@ -307,7 +314,8 @@ class CholmodBase : public SparseSolverBase<Derived>
      eigen_assert(size==b.rows());

      // note: cs stands for Cholmod Sparse
-      cholmod_sparse b_cs = viewAsCholmod(b);
+      Ref<SparseMatrix<typename RhsDerived::Scalar,ColMajor,typename RhsDerived::StorageIndex> > b_ref(b.const_cast_derived());
+      cholmod_sparse b_cs = viewAsCholmod(b_ref);
      cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
      if(!x_cs)
      {
@@ -315,7 +323,7 @@ class CholmodBase : public SparseSolverBase<Derived>
        return;
      }
      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
-      dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
+      dest.derived() = viewAsEigen<typename DestDerived::Scalar,ColMajor,typename DestDerived::StorageIndex>(*x_cs);
      cholmod_free_sparse(&x_cs, &m_cholmod);
    }
    #endif // EIGEN_PARSED_BY_DOXYGEN
@@ -570,7 +578,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
  * \warning Only double precision real and complex scalar types are supported by Cholmod.
-  * 
+  *
  * \sa \ref TutorialSparseSolverConcept
  */
 template<typename _MatrixType, int _UpLo = Lower>
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -407,7 +407,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
                                                            : int(Kernel::AssignmentTraits::DstAlignment),
      srcAlignment = Kernel::AssignmentTraits::JointAlignment
    };
-    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;

    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
@@ -527,7 +527,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
      dstAlignment = alignable ? int(requestedAlignment)
                               : int(Kernel::AssignmentTraits::DstAlignment)
    };
-    const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
+    const Scalar *dst_ptr = kernel.dstDataPtr();
    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
    {
      // the pointer is not aligend-on scalar, so alignment is not possible
@@ -683,6 +683,11 @@ public:
      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
      : outer;
  }
+
+  EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
+  {
+    return m_dstExpr.data();
+  }
  
 protected:
  DstEvaluatorType& m_dst;
@@ -876,6 +881,34 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    src.evalTo(dst);
  }
+
+  // NOTE The following two functions are templated to avoid their instanciation if not needed
+  //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.addTo(dst);
+  }
+
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.subTo(dst);
+  }
 };

 } // namespace internal
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -46,7 +46,7 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  enum {
-    Flags = _LhsNested::Flags & RowMajorBit
+    Flags = cwise_promote_storage_order<typename traits<Lhs>::StorageKind,typename traits<Rhs>::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value
  };
 };
 } // end namespace internal
@@ -84,6 +84,7 @@ class CwiseBinaryOp :
 {
  public:
    
+    typedef typename internal::remove_all<BinaryOp>::type Functor;
    typedef typename internal::remove_all<LhsType>::type Lhs;
    typedef typename internal::remove_all<RhsType>::type Rhs;

--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -51,7 +51,8 @@ struct dot_nocheck<T, U, true>

 } // end namespace internal

-/** \returns the dot product of *this with other.
+/** \fn MatrixBase::dot
+  * \returns the dot product of *this with other.
  *
  * \only_for_vectors
  *
@@ -70,9 +71,11 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
  typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
  EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
-
+#endif
+  
  eigen_assert(size() == other.size());

  return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -25,7 +25,8 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
 template<int Size, int MaxSize> struct product_size_category
 {
  enum { is_large = MaxSize == Dynamic ||
-                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD,
+                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+                    (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
         value = is_large  ? Large
               : Size == 1 ? 1
                           : Small
@@ -223,50 +224,65 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
      // on, the other hand it is good for the cache to pack the vector anyways...
      EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (ActualDest::InnerStrideAtCompileTime!=1) || ComplexByReal
+      MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal
    };

-    gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
-
-    const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
-    const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-
-    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
-
-    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
-                                                  evalToDest ? dest.data() : static_dest.data());
-
-    if(!evalToDest)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      Index size = dest.size();
-      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      if(!alphaIsCompatible)
-      {
-        MappedDest(actualDestPtr, dest.size()).setZero();
-        compatibleAlpha = RhsScalar(1);
-      }
-      else
-        MappedDest(actualDestPtr, dest.size()) = dest;
-    }
-
    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
-    general_matrix_vector_product
-        <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
-        actualLhs.rows(), actualLhs.cols(),
-        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
-        RhsMapper(actualRhs.data(), actualRhs.innerStride()),
-        actualDestPtr, 1,
-        compatibleAlpha);
+    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);

-    if (!evalToDest)
+    if(!MightCannotUseDest)
    {
-      if(!alphaIsCompatible)
-        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
-      else
-        dest = MappedDest(actualDestPtr, dest.size());
+      // shortcut if we are sure to be able to use dest directly,
+      // this ease the compiler to generate cleaner and more optimzized code for most common cases
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          dest.data(), 1,
+          compatibleAlpha);
+    }
+    else
+    {
+      gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+      const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+      const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+      ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                    evalToDest ? dest.data() : static_dest.data());
+
+      if(!evalToDest)
+      {
+        #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        Index size = dest.size();
+        EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        #endif
+        if(!alphaIsCompatible)
+        {
+          MappedDest(actualDestPtr, dest.size()).setZero();
+          compatibleAlpha = RhsScalar(1);
+        }
+        else
+          MappedDest(actualDestPtr, dest.size()) = dest;
+      }
+
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          actualDestPtr, 1,
+          compatibleAlpha);
+
+      if (!evalToDest)
+      {
+        if(!alphaIsCompatible)
+          dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size());
+        else
+          dest = MappedDest(actualDestPtr, dest.size());
+      }
    }
  }
 };
@@ -329,6 +345,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,false>
  template<typename Lhs, typename Rhs, typename Dest>
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
    typename nested_eval<Rhs,1>::type actual_rhs(rhs);
    const Index size = rhs.rows();
@@ -342,6 +359,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
  template<typename Lhs, typename Rhs, typename Dest>
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
    typename nested_eval<Rhs,Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
    const Index rows = dest.rows();
    for(Index i=0; i<rows; ++i)
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -105,7 +105,7 @@ class WithFormat
    }

  protected:
-    const typename ExpressionType::Nested m_matrix;
+    typename ExpressionType::Nested m_matrix;
    IOFormat m_format;
 };

--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -45,6 +45,7 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::S
 public:
  typedef typename XprType::StorageIndex StorageIndex;
  typedef typename XprType::PlainObject                       PlainObject;
+  typedef typename XprType::Scalar                            Scalar;
  typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
  typedef typename internal::ref_selector<Inverse>::type Nested;
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -58,6 +58,28 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m

 } // end namespace internal

+#ifdef EIGEN_PARSED_BY_DOXYGEN
+namespace doxygen {
+
+// This is a workaround to doxygen not being able to understand the inheritance logic
+// when it is hidden by the dense_xpr_base helper struct.
+// Moreover, doxygen fails to include members that are not documented in the declaration body of
+// MatrixBase if we inherits MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >,
+// this is why we simply inherits MatrixBase, though this does not make sense.
+
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename Derived> struct dense_xpr_base_dispatcher;
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public MatrixBase {};
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public ArrayBase {};
+
+} // namespace doxygen
+
 /** \class PlainObjectBase
  * \ingroup Core_Module
  * \brief %Dense storage base class for matrices and arrays.
@@ -65,26 +87,10 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
  * This class can be extended with the help of the plugin mechanism described on the page
  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
  *
+  * \tparam Derived is the derived type, e.g., a Matrix or Array
+  *
  * \sa \ref TopicClassHierarchy
  */
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace doxygen {
-
-// this is a workaround to doxygen not being able to understand the inheritance logic
-// when it is hidden by the dense_xpr_base helper struct.
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename Derived> struct dense_xpr_base_dispatcher;
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-
-} // namespace doxygen
-
 template<typename Derived>
 class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
 #else
@@ -554,7 +560,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type

  public:

-    /** \copydoc DenseBase::operator=(const EigenBase<OtherDerived>&)
+    /** \brief Copies the generic expression \a other into *this.
+      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
      */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC 
@@ -763,6 +770,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    {
      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
      const bool is_integer = NumTraits<T>::IsInteger;
+      EIGEN_UNUSED_VARIABLE(is_integer);
      EIGEN_STATIC_ASSERT(is_integer,
                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
      resize(size);
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -158,10 +158,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
  static EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
  {
-    Index dstRows = src.rows();
-    Index dstCols = src.cols();
-    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-      dst.resize(dstRows, dstCols);
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
  }
@@ -176,10 +173,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
  static EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
  {
-    Index dstRows = src.rows();
-    Index dstCols = src.cols();
-    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-      dst.resize(dstRows, dstCols);
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
  }
@@ -366,17 +360,21 @@ template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
 {
+  typedef typename nested_eval<Lhs,1>::type LhsNested;
+  typedef typename nested_eval<Rhs,1>::type RhsNested;
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-  typedef typename internal::conditional<int(Side)==OnTheRight,Lhs,Rhs>::type MatrixType;
+  typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;

  template<typename Dest>
  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
+    LhsNested actual_lhs(lhs);
+    RhsNested actual_rhs(rhs);
    internal::gemv_dense_selector<Side,
                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
-                           >::run(lhs, rhs, dst, alpha);
+                           >::run(actual_lhs, actual_rhs, dst, alpha);
  }
 };

--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -45,7 +45,7 @@ struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
 };
 }

-// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
+
 template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
  : public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
 {
@@ -60,10 +60,12 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    /** \brief The type of coefficients in this matrix */
    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;

    enum {
      Mode = internal::traits<SelfAdjointView>::Mode,
-      Flags = internal::traits<SelfAdjointView>::Flags
+      Flags = internal::traits<SelfAdjointView>::Flags,
+      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
    };
    typedef typename MatrixType::PlainObject PlainObject;

@@ -187,6 +189,36 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
    }

+    typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef SelfAdjointView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
    /** \returns a const expression of the main diagonal of the matrix \c *this
      *
      * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
@@ -287,6 +319,7 @@ public:
 * Implementation of MatrixBase methods
 ***************************************************************************/

+/** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
 typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
@@ -295,6 +328,15 @@ MatrixBase<Derived>::selfadjointView() const
  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
 }

+/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the current matrix
+  *
+  * The parameter \a UpLo can be either \c #Upper or \c #Lower
+  *
+  * Example: \include MatrixBase_selfadjointView.cpp
+  * Output: \verbinclude MatrixBase_selfadjointView.out
+  *
+  * \sa class SelfAdjointView
+  */
 template<typename Derived>
 template<unsigned int UpLo>
 typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -161,6 +161,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 * TriangularView methods
 ***************************************************************************/

+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
 void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
@@ -188,6 +189,7 @@ TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) co
 {
  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
 }
+#endif

 namespace internal {

--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -470,6 +470,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
      * \a Side==OnTheRight.
      *
+      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      *
      * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
      * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
      * is an upper (resp. lower) triangular matrix.
@@ -495,6 +497,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
      * This function will const_cast it, so constness isn't honored here.
      *
+      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      *
      * See TriangularView:solve() for the details.
      */
    template<int Side, typename OtherDerived>
@@ -539,13 +543,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat

    template<typename ProductType>
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha);
+    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
 };

 /***************************************************************************
 * Implementation of triangular evaluation/assignment
 ***************************************************************************/

+#ifndef EIGEN_PARSED_BY_DOXYGEN
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
@@ -583,6 +588,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
  eigen_assert(Mode == int(OtherDerived::Mode));
  internal::call_assignment_no_alias(derived(), other.derived());
 }
+#endif

 /***************************************************************************
 * Implementation of TriangularBase methods
@@ -944,8 +950,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);

-    dst.setZero();
-    dst._assignProduct(src, 1);
+    dst._assignProduct(src, 1, 0);
  }
 };

@@ -956,7 +961,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
-    dst._assignProduct(src, 1);
+    dst._assignProduct(src, 1, 1);
  }
 };

@@ -967,7 +972,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_ass
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
-    dst._assignProduct(src, -1);
+    dst._assignProduct(src, -1, 1);
  }
 };

--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -194,7 +194,8 @@ struct functor_traits<max_coeff_visitor<Scalar> > {

 } // end namespace internal

-/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
  * \warning the result is undefined if \c *this contains NaN.
  *
  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
@@ -230,7 +231,8 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
  return minVisitor.res;
 }

-/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
  * \warning the result is undefined if \c *this contains NaN. 
  *
  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -395,14 +395,11 @@ template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)

 template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
 {
-  Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
-  tmp0 = _mm256_hadd_ps(tmp0,tmp0);
-  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
+  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
 }
 template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
 {
-  Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
-  return pfirst(_mm256_hadd_pd(tmp0,tmp0));
+  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
 }

 template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -15,14 +15,14 @@ namespace Eigen {

 namespace internal {

-static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
 #ifdef __VSX__
 #if defined(_BIG_ENDIAN)
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #else
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #endif
 #endif

--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -84,8 +84,10 @@ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);

+#ifdef __POWER8_VECTOR__
 static Packet2l p2l_1023 = { 1023, 1023 };
 static Packet2ul p2ul_52 = { 52, 52 };
+#endif

 #endif

--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -72,7 +72,7 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
-static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
 #ifndef __VSX__
 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 #endif
@@ -358,7 +358,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_ZERO); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }

 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
@@ -373,7 +373,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
  t   = vec_nmsub(y_0, b, p4f_ONE);
  y_1 = vec_madd(y_0, t, y_0);

-  return vec_madd(a, y_1, p4f_ZERO);
+  return vec_madd(a, y_1, p4f_MZERO);
 #else
  return vec_div(a, b);
 #endif
@@ -766,7 +766,7 @@ static Packet2l  p2l_ONE  = { 1, 1 };
 static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
 static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
 static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
-static Packet2d  p2d_ZERO_ = { -0.0, -0.0 };
+static Packet2d  p2d_MZERO = { -0.0, -0.0 };

 #ifdef _BIG_ENDIAN
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
@@ -904,7 +904,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_

 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }

 // for some weird raisons, it has to be overloaded for packet of integers
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -28,11 +28,13 @@ namespace internal {
 #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
 #endif

-// FIXME NEON has 16 quad registers, but since the current register allocator
-// is so bad, it is much better to reduce it to 8
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if EIGEN_ARCH_ARM64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#else
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
 #endif
+#endif

 typedef float32x2_t Packet2f;
 typedef float32x4_t Packet4f;
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -28,7 +28,7 @@ namespace internal {
 #endif
 #endif

-#if (defined EIGEN_VECTORIZE_AVX) && EIGEN_COMP_GNUC_STRICT && (__GXX_ABI_VERSION < 1004)
+#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)
 // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
 // have overloads for both types without linking error.
 // One solution is to increase ABI version using -fabi-version=4 (or greater).
@@ -504,30 +504,13 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
 }
+
 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
  return _mm_hadd_pd(vecs[0], vecs[1]);
 }

-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
-}
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
 #else
-// SSE2 versions
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
-  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
-  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
-}
-
 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
  Packet4f tmp0, tmp1, tmp2;
@@ -548,6 +531,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 }
 #endif  // SSE3

+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
+//   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
+// #else
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+// #endif
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
+// #else
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+// #endif
+}

 #ifdef EIGEN_VECTORIZE_SSSE3
 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,13 +25,48 @@ struct Packet1cd
  Packet2d v;
 };

+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  union {
+    Packet4f v;
+    Packet1cd cd[2];
+  };
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasBlend  = 1,
+    HasSetLinear = 0
+  };
+};
+
+
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
  typedef Packet1cd type;
  typedef Packet1cd half;
  enum {
    Vectorizable = 1,
-    AlignedOnScalar = 0,
+    AlignedOnScalar = 1,
    size = 1,
    HasHalfPacket = 0,

@@ -47,20 +83,68 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
  };
 };

+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };

+/* Forward declaration */
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }

 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }

+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+  res.cd[1] = res.cd[0];
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
+{
+  return pload<Packet1cd>(from);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
+{
+  pstore<std::complex<double> >(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
@@ -79,43 +163,90 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con

  return Packet1cd(v1 + v2);
 }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  return pset1<Packet1cd>(*from);
+  Packet2cf res;
+  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
 }

+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }

 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res[2];
-  pstore<std::complex<double> >(res, a);
+  std::complex<double> EIGEN_ALIGN16 res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  pstore<std::complex<float> >(res, a);

  return res[0];
 }

 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.cd[0] = a.cd[1];
+  res.cd[1] = a.cd[0];
+  return res;
+}

 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
 {
  return pfirst(a);
 }
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
 {
  return vecs[0];
 }
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  PacketBlock<Packet2cf,2> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  ptranspose(transpose);
+
+  return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]);
+} 

 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
 {
  return pfirst(a);
 }
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}

 template<int Offset>
 struct palign_impl<Offset,Packet1cd>
@@ -127,6 +258,18 @@ struct palign_impl<Offset,Packet1cd>
  }
 };

+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset == 1) {
+      first.cd[0] = first.cd[1];
+      first.cd[1] = second.cd[0];
+    }
+  }
+};
+
 template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
@@ -160,6 +303,39 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
  }
 };

+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
  // TODO optimize it for AltiVec
@@ -168,17 +344,49 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
 }

+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res;
+  res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
+  res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
+  return res;
+}
+
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
 {
  return Packet1cd(preverse(Packet2d(x.v)));
 }

+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+  Packet2cf res;
+  res.cd[0] = pcplxflip(x.cd[0]);
+  res.cd[1] = pcplxflip(x.cd[1]);
+  return res;
+}
+
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
 {
  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
  kernel.packet[0].v = tmp;
 }
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet1cd tmp = kernel.packet[0].cd[1];
+  kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
+  kernel.packet[1].cd[0] = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+  result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+  return result;
+}
+
 } // end namespace internal

 } // end namespace Eigen
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -3,6 +3,7 @@
 //
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -19,32 +20,32 @@ namespace Eigen {

 namespace internal {

+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
  Packet2d x = _x;

-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
  Packet2d tmp, fx;
  Packet2l emm0;

@@ -91,18 +92,44 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
                 isnumber_mask);
 }

+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& x)
+{
+  Packet4f res;
+  res.v4f[0] = pexp<Packet2d>(x.v4f[0]);
+  res.v4f[1] = pexp<Packet2d>(x.v4f[1]);
+  return res;
+}
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d psqrt<Packet2d>(const Packet2d& x)
 {
  return  __builtin_s390_vfsqdb(x);
 }

+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  Packet4f res;
+  res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+  return res;
+}
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
 }

+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  Packet4f res;
+  res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+  return res;
+}
+
 }  // end namespace internal

 }  // end namespace Eigen
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -28,9 +28,8 @@ namespace internal {
 #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
 #endif

-// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  16
 #endif

 typedef __vector int                 Packet4i;
@@ -42,6 +41,10 @@ typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;

+typedef struct {
+	Packet2d  v4f[2];
+} Packet4f;
+
 typedef union {
  int32_t   i[4];
  uint32_t ui[4];
@@ -88,6 +91,7 @@ static Packet2d p2d_ONE = { 1.0, 1.0 };
 static Packet2d p2d_ZERO_ = { -0.0, -0.0 };

 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));

 static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
@@ -132,13 +136,11 @@ template<> struct packet_traits<int>    : default_packet_traits
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
-    // FIXME check the Has*
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,

-    // FIXME check the Has*
    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
@@ -147,6 +149,37 @@ template<> struct packet_traits<int>    : default_packet_traits
  };
 };

+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
 template<> struct packet_traits<double> : default_packet_traits
 {
  typedef Packet2d type;
@@ -157,7 +190,6 @@ template<> struct packet_traits<double> : default_packet_traits
    size=2,
    HasHalfPacket = 1,

-    // FIXME check the Has*
    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
@@ -180,8 +212,12 @@ template<> struct packet_traits<double> : default_packet_traits
 };

 template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };

+/* Forward declaration */
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
+ 
 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
 {
  Packet vt;
@@ -222,6 +258,32 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
  return s;
 }

+/* Helper function to simulate a vec_splat_packet4f
+ */
+template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
+{
+  Packet4f splat;
+  switch (element) {
+  case 0:
+    splat.v4f[0] = vec_splat(from.v4f[0], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 1:
+    splat.v4f[0] = vec_splat(from.v4f[0], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 2:
+    splat.v4f[0] = vec_splat(from.v4f[1], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 3:
+    splat.v4f[0] = vec_splat(from.v4f[1], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  }
+  return splat;
+}
+
 template<int Offset>
 struct palign_impl<Offset,Packet4i>
 {
@@ -238,6 +300,31 @@ struct palign_impl<Offset,Packet4i>
  }
 };

+/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
+ */
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    switch (Offset % 4) {
+    case 1:
+      first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
+      first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
+      break;
+    case 2:
+      first.v4f[0] = first.v4f[1];
+      first.v4f[1] = second.v4f[0];
+      break;
+    case 3:
+      first.v4f[0] = vec_sld(first.v4f[1],  second.v4f[0], 8);
+      first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
+      break;
+    }
+  }
+};
+
+
 template<int Offset>
 struct palign_impl<Offset,Packet2d>
 {
@@ -257,6 +344,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
  return vfrom->v4i;
 }

+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet4f vfrom;
+  vfrom.v4f[0] = vec_ld2f(&from[0]);
+  vfrom.v4f[1] = vec_ld2f(&from[2]);
+  return vfrom;
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
 {
  // FIXME: No intrinsic yet
@@ -275,6 +372,15 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& f
  vto->v4i = from;
 }

+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_st2f(from.v4f[0], &to[0]);
+  vec_st2f(from.v4f[1], &to[2]);
+}
+
+
 template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
  // FIXME: No intrinsic yet
@@ -288,10 +394,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
 {
  return vec_splats(from);
 }
-
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
  return vec_splats(from);
 }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
+{
+  Packet4f to;
+  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+  to.v4f[1] = to.v4f[0];
+  return to;
+}

 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4i>(const int *a,
@@ -304,6 +416,17 @@ pbroadcast4<Packet4i>(const int *a,
  a3 = vec_splat(a3, 3);
 }

+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat_packet4f<0>(a3);
+  a1 = vec_splat_packet4f<1>(a3);
+  a2 = vec_splat_packet4f<2>(a3);
+  a3 = vec_splat_packet4f<3>(a3);
+}
+
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
@@ -326,6 +449,16 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
 return pload<Packet4i>(ai);
 }

+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4f>(ai);
+}
+
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
  double EIGEN_ALIGN16 af[2];
@@ -344,6 +477,16 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
  to[3*stride] = ai[3];
 }

+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  pstore<float>((float *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
  double EIGEN_ALIGN16 af[2];
@@ -353,52 +496,160 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
 }

 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] + b.v4f[0];
+  c.v4f[1] = a.v4f[1] + b.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }

 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }

 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }

 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }

 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }

 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }

 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }

 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}

+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }

 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*     from) { return pload<Packet4f>(from); }
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }


@@ -408,6 +659,14 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }

+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
  Packet2d p = pload<Packet2d>(from);
@@ -415,12 +674,15 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 }

 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*    to, const Packet4f& from) { pstore<float>(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }

 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }

 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }

 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
@@ -433,8 +695,23 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
 }

-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}

 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
@@ -453,6 +730,13 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
  sum = padd<Packet2d>(a, b);
  return pfirst(sum);
 }
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+  double first = predux<Packet2d>(sum);
+  return static_cast<float>(first);
+}

 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 {
@@ -493,6 +777,21 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
  return sum;
 }

+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  PacketBlock<Packet4f,4> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  transpose.packet[2] = vecs[2];
+  transpose.packet[3] = vecs[3];
+  ptranspose(transpose);
+
+  Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
+  sum = padd(sum, transpose.packet[2]);
+  sum = padd(sum, transpose.packet[3]);
+  return sum;
+}
+
 // Other reduction functions:
 // mul
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@@ -507,6 +806,12 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }

+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  // Return predux_mul<Packet2d> of the subvectors product
+  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
 // min
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
@@ -521,6 +826,14 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }

+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
 // max
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
@@ -536,6 +849,14 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }

+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4i,4>& kernel) {
  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
@@ -556,12 +877,61 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
  kernel.packet[1] = t1;
 }

+/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
+ */
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  PacketBlock<Packet2d,2> t0,t1,t2,t3;
+  // copy top-left 2x2 Packet2d block
+  t0.packet[0] = kernel.packet[0].v4f[0];
+  t0.packet[1] = kernel.packet[1].v4f[0];
+
+  // copy top-right 2x2 Packet2d block
+  t1.packet[0] = kernel.packet[0].v4f[1];
+  t1.packet[1] = kernel.packet[1].v4f[1];
+
+  // copy bottom-left 2x2 Packet2d block
+  t2.packet[0] = kernel.packet[2].v4f[0];
+  t2.packet[1] = kernel.packet[3].v4f[0];
+
+  // copy bottom-right 2x2 Packet2d block
+  t3.packet[0] = kernel.packet[2].v4f[1];
+  t3.packet[1] = kernel.packet[3].v4f[1];
+
+  // Transpose all 2x2 blocks
+  ptranspose(t0);
+  ptranspose(t1);
+  ptranspose(t2);
+  ptranspose(t3);
+
+  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
+  kernel.packet[0].v4f[0] = t0.packet[0];
+  kernel.packet[0].v4f[1] = t2.packet[0];
+  kernel.packet[1].v4f[0] = t0.packet[1];
+  kernel.packet[1].v4f[1] = t2.packet[1];
+  kernel.packet[2].v4f[0] = t1.packet[0];
+  kernel.packet[2].v4f[1] = t3.packet[0];
+  kernel.packet[3].v4f[0] = t1.packet[1];
+  kernel.packet[3].v4f[1] = t3.packet[1];
+}
+
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
  return vec_sel(elsePacket, thenPacket, mask);
 }

+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
+  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet4f result;
+  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
+  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
+  return result;
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -45,7 +45,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
    m_interPacket(plset<Packet>(0)),
-    m_flip(std::abs(high)<std::abs(low))
+    m_flip(numext::abs(high)<numext::abs(low))
  {}

  template<typename IndexType>
@@ -94,7 +94,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
    m_low(low),
    m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
    m_divisor(convert_index<Scalar>(num_steps+high-low)/(high-low+1)),
-    m_use_divisor((high-low+1)<num_steps)
+    m_use_divisor((high+1)<(low+num_steps))
  {}

  template<typename IndexType>
@@ -173,6 +173,13 @@ template<typename Scalar, typename PacketType,typename IndexType>
 struct has_unary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 1}; };
 template<typename Scalar, typename PacketType,typename IndexType>
 struct has_binary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
 #endif

 } // end namespace internal
--- a/Eigen/src/Core/functors/StlFunctors.h
+++ b/Eigen/src/Core/functors/StlFunctors.h
@@ -72,7 +72,7 @@ template<typename T>
 struct functor_traits<std::not_equal_to<T> >
 { enum { Cost = 1, PacketAccess = false }; };

-#if(__cplusplus < 201103L)
+#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900)
 // std::binder* are deprecated since c++11 and will be removed in c++17
 template<typename T>
 struct functor_traits<std::binder2nd<T> >
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -972,7 +972,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA+(3*K+16)*LhsProgress); \
-              if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
+              if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -199,7 +199,7 @@ struct general_product_to_triangular_selector;
 template<typename MatrixType, typename ProductType, int UpLo>
 struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
 {
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
  {
    typedef typename MatrixType::Scalar Scalar;
    
@@ -217,6 +217,9 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());

+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();
+
    enum {
      StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
      UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
@@ -244,7 +247,7 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
 template<typename MatrixType, typename ProductType, int UpLo>
 struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 {
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
  {
    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
    typedef internal::blas_traits<Lhs> LhsBlasTraits;
@@ -260,6 +263,9 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>

    typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());

+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();
+
    enum {
      IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
      LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
@@ -286,11 +292,11 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>

 template<typename MatrixType, unsigned int UpLo>
 template<typename ProductType>
-TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha)
+TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
 {
  eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
  
-  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha);
+  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
  
  return derived();
 }
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -33,7 +33,7 @@
 #ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
 #define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H

-namespace Eigen { 
+namespace Eigen {

 namespace internal {

@@ -86,8 +86,8 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
  /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
 \
   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
-   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
-   EIGTYPE beta; \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
+   EIGTYPE beta(1); \
   BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
  } \
 };
@@ -107,7 +107,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
 \
   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
-   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'C':'N'); \
   RTYPE alpha_, beta_; \
   const EIGTYPE* a_ptr; \
 \
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -183,7 +183,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
    }
  }

-/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right
+/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
 */
 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
 struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
@@ -202,6 +202,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
    level3_blocking<Scalar,Scalar>& blocking)
  {
    Index rows = otherSize;
+    typedef typename NumTraits<Scalar>::Real RealScalar;

    typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
@@ -306,9 +307,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
              }
              if((Mode & UnitDiag)==0)
              {
-                Scalar b = conj(rhs(j,j));
+                Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
                for (Index i=0; i<actual_mc; ++i)
-                  r[i] /= b;
+                  r[i] *= inv_rjj;
              }
            }

--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -12,8 +12,8 @@
 #define EIGEN_MACROS_H

 #define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 95
+#define EIGEN_MAJOR_VERSION 3
+#define EIGEN_MINOR_VERSION 2

 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -356,7 +356,7 @@
 #define EIGEN_MAX_CPP_VER 99
 #endif

-#if EIGEN_MAX_CPP_VER>=11 && defined(__cplusplus) && (__cplusplus >= 201103L)
+#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
 #define EIGEN_HAS_CXX11 1
 #else
 #define EIGEN_HAS_CXX11 0
@@ -497,10 +497,11 @@
 // attribute to maximize inlining. This should only be used when really necessary: in particular,
 // it uses __attribute__((always_inline)) on GCC, which most of the time is useless and can severely harm compile times.
 // FIXME with the always_inline attribute,
-// gcc 3.4.x reports the following compilation error:
+// gcc 3.4.x and 4.1 reports the following compilation error:
 //   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
 //    : function body not available
-#if EIGEN_GNUC_AT_LEAST(4,0)
+//   See also bug 1367
+#if EIGEN_GNUC_AT_LEAST(4,2)
 #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
 #else
 #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -523,7 +523,7 @@ template<typename T> struct smart_memmove_helper<T,true> {
 template<typename T> struct smart_memmove_helper<T,false> {
  static inline void run(const T* start, const T* end, T* target)
  { 
-    if (uintptr_t(target) < uintptr_t(start))
+    if (UIntPtr(target) < UIntPtr(start))
    {
      std::copy(start, end, target);
    }
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -381,12 +381,12 @@ struct has_ReturnType
  enum { value = sizeof(testFunctor<T>(0)) == sizeof(meta_yes) };
 };

-template<typename T> const T& return_ref();
+template<typename T> const T* return_ptr();

 template <typename T, typename IndexType=Index>
 struct has_nullary_operator
 {
-  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()())>0)>::type * = 0);
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()())>0)>::type * = 0);
  static meta_no testFunctor(...);

  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
@@ -395,7 +395,7 @@ struct has_nullary_operator
 template <typename T, typename IndexType=Index>
 struct has_unary_operator
 {
-  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()(IndexType(0)))>0)>::type * = 0);
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0)))>0)>::type * = 0);
  static meta_no testFunctor(...);

  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
@@ -404,7 +404,7 @@ struct has_unary_operator
 template <typename T, typename IndexType=Index>
 struct has_binary_operator
 {
-  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()(IndexType(0),IndexType(0)))>0)>::type * = 0);
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0);
  static meta_no testFunctor(...);

  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -445,15 +445,11 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
                                                  //      Another solution could be to count the number of temps?
    NAsInteger = n == Dynamic ? HugeCost : n,
    CostEval   = (NAsInteger+1) * ScalarReadCost + CoeffReadCost,
-    CostNoEval = NAsInteger * CoeffReadCost
+    CostNoEval = NAsInteger * CoeffReadCost,
+    Evaluate = (int(evaluator<T>::Flags) & EvalBeforeNestingBit) || (int(CostEval) < int(CostNoEval))
  };

-  typedef typename conditional<
-        ( (int(evaluator<T>::Flags) & EvalBeforeNestingBit) ||
-          (int(CostEval) < int(CostNoEval)) ),
-        PlainObject,
-        typename ref_selector<T>::type
-  >::type type;
+  typedef typename conditional<Evaluate, PlainObject, typename ref_selector<T>::type>::type type;
 };

 template<typename T>
@@ -536,6 +532,15 @@ template <typename B, typename Functor>                   struct cwise_promote_s
 template <typename Functor>                               struct cwise_promote_storage_type<Sparse,Dense,Functor>                             { typedef Sparse ret; };
 template <typename Functor>                               struct cwise_promote_storage_type<Dense,Sparse,Functor>                             { typedef Sparse ret; };

+template <typename LhsKind, typename RhsKind, int LhsOrder, int RhsOrder> struct cwise_promote_storage_order {
+  enum { value = LhsOrder };
+};
+
+template <typename LhsKind, int LhsOrder, int RhsOrder>   struct cwise_promote_storage_order<LhsKind,Sparse,LhsOrder,RhsOrder>                { enum { value = RhsOrder }; };
+template <typename RhsKind, int LhsOrder, int RhsOrder>   struct cwise_promote_storage_order<Sparse,RhsKind,LhsOrder,RhsOrder>                { enum { value = LhsOrder }; };
+template <int Order>                                      struct cwise_promote_storage_order<Sparse,Sparse,Order,Order>                       { enum { value = Order }; };
+
+
 /** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B.
  * The template parameter ProductTag permits to specialize the resulting storage kind wrt to
  * some compile-time properties of the product: GemmProduct, GemvProduct, OuterProduct, InnerProduct.
--- a/Eigen/src/Geometry/Hyperplane.h
+++ b/Eigen/src/Geometry/Hyperplane.h
@@ -217,7 +217,10 @@ public:
  EIGEN_DEVICE_FUNC inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
  {
    if (traits==Affine)
+    {
      normal() = mat.inverse().transpose() * normal();
+      m_coeffs /= normal().norm();
+    }
    else if (traits==Isometry)
      normal() = mat * normal();
    else
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -330,25 +330,27 @@ public:
  }
  
  /** \internal */
-  template<typename Rhs, typename DestScalar, int DestOptions, typename DestIndex>
-  void _solve_impl(const Rhs& b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+  template<typename Rhs, typename DestDerived>
+  void _solve_impl(const Rhs& b, SparseMatrixBase<DestDerived> &aDest) const
  {
    eigen_assert(rows()==b.rows());
    
    Index rhsCols = b.cols();
    Index size = b.rows();
+    DestDerived& dest(aDest.derived());
+    typedef typename DestDerived::Scalar DestScalar;
    Eigen::Matrix<DestScalar,Dynamic,1> tb(size);
    Eigen::Matrix<DestScalar,Dynamic,1> tx(cols());
    // We do not directly fill dest because sparse expressions have to be free of aliasing issue.
    // For non square least-square problems, b and dest might not have the same size whereas they might alias each-other.
-    SparseMatrix<DestScalar,DestOptions,DestIndex> tmp(cols(),rhsCols);
+    typename DestDerived::PlainObject tmp(cols(),rhsCols);
    for(Index k=0; k<rhsCols; ++k)
    {
      tb = b.col(k);
      tx = derived().solve(tb);
      tmp.col(k) = tx.sparseView(0);
    }
-    tmp.swap(dest);
+    dest.swap(tmp);
  }

 protected:
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -506,7 +506,7 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
    m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k);
  }

-  RealScalar threshold_helper =  numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits<Scalar>::epsilon()) / RealScalar(rows);
+  RealScalar threshold_helper =  numext::abs2<Scalar>(m_colNormsUpdated.maxCoeff() * NumTraits<Scalar>::epsilon()) / RealScalar(rows);
  RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<Scalar>::epsilon());

  m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
@@ -557,8 +557,8 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
        RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j);
        temp = (RealScalar(1) + temp) * (RealScalar(1) - temp);
        temp = temp < 0 ? 0 : temp;
-        RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) /
-                                               m_colNormsDirect.coeffRef(j));
+        RealScalar temp2 = temp * numext::abs2<Scalar>(m_colNormsUpdated.coeffRef(j) /
+                                                       m_colNormsDirect.coeffRef(j));
        if (temp2 <= norm_downdate_threshold) {
          // The updated norm has become too inaccurate so re-compute the column
          // norm directly.
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -138,7 +138,7 @@ class CompleteOrthogonalDecomposition {
   * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
   * which \c *this is the complete orthogonal decomposition.
   *
-   * \param B the right-hand sides of the problem to solve.
+   * \param b the right-hand sides of the problem to solve.
   *
   * \returns a solution.
   *
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -412,7 +412,7 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
    }

    // update largest diagonal entry
-    maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q))));
+    maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi<RealScalar>(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q))));
    // and check whether the 2x2 block is already diagonal
    RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
    return abs(work_matrix.coeff(p,q))>threshold || abs(work_matrix.coeff(q,p)) > threshold;
@@ -725,7 +725,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
            if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);

            // keep track of the largest diagonal coefficient
-            maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
+            maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
          }
        }
      }
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -224,11 +224,11 @@ class SparseCompressedBase<Derived>::ReverseInnerIterator
      }
      else
      {
-        m_start.value() = mat.outerIndexPtr()[outer];
+        m_start = mat.outerIndexPtr()[outer];
        if(mat.isCompressed())
          m_id = mat.outerIndexPtr()[outer+1];
        else
-          m_id = m_start.value() + mat.innerNonZeroPtr()[outer];
+          m_id = m_start + mat.innerNonZeroPtr()[outer];
      }
    }

@@ -254,14 +254,15 @@ class SparseCompressedBase<Derived>::ReverseInnerIterator
    inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
    inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }

-    inline operator bool() const { return (m_id > m_start.value()); }
+    inline operator bool() const { return (m_id > m_start); }

  protected:
    const Scalar* m_values;
    const StorageIndex* m_indices;
-    const internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> m_outer;
+    typedef internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> OuterType;
+    const OuterType m_outer;
+    Index m_start;
    Index m_id;
-    const internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> m_start;
 };

 namespace internal {
@@ -272,7 +273,6 @@ struct evaluator<SparseCompressedBase<Derived> >
 {
  typedef typename Derived::Scalar Scalar;
  typedef typename Derived::InnerIterator InnerIterator;
-  typedef typename Derived::ReverseInnerIterator ReverseInnerIterator;
  
  enum {
    CoeffReadCost = NumTraits<Scalar>::ReadCost,
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -45,7 +45,7 @@ class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
      EIGEN_STATIC_ASSERT((
                (!internal::is_same<typename internal::traits<Lhs>::StorageKind,
                                    typename internal::traits<Rhs>::StorageKind>::value)
-            ||  ((Lhs::Flags&RowMajorBit) == (Rhs::Flags&RowMajorBit))),
+            ||  ((internal::evaluator<Lhs>::Flags&RowMajorBit) == (internal::evaluator<Rhs>::Flags&RowMajorBit))),
            THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH);
    }
 };
@@ -68,7 +68,6 @@ protected:
  typedef typename XprType::StorageIndex StorageIndex;
 public:

-  class ReverseInnerIterator;
  class InnerIterator
  {
  public:
@@ -111,6 +110,7 @@ public:
    EIGEN_STRONG_INLINE Scalar value() const { return m_value; }

    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
    EIGEN_STRONG_INLINE Index row() const { return Lhs::IsRowMajor ? m_lhsIter.row() : index(); }
    EIGEN_STRONG_INLINE Index col() const { return Lhs::IsRowMajor ? index() : m_lhsIter.col(); }

@@ -161,7 +161,6 @@ protected:
  typedef typename XprType::StorageIndex StorageIndex;
 public:

-  class ReverseInnerIterator;
  class InnerIterator
  {
    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
@@ -195,6 +194,7 @@ public:
    EIGEN_STRONG_INLINE Scalar value() const { eigen_internal_assert(m_id<m_innerSize); return m_value; }

    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_rhsIter.outer() : m_id; }
    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_rhsIter.outer(); }

@@ -249,7 +249,6 @@ protected:
  typedef typename XprType::StorageIndex StorageIndex;
 public:

-  class ReverseInnerIterator;
  class InnerIterator
  {
    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
@@ -283,6 +282,7 @@ public:
    EIGEN_STRONG_INLINE Scalar value() const { eigen_internal_assert(m_id<m_innerSize); return m_value; }

    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_lhsIter.outer() : m_id; }
    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_lhsIter.outer(); }

@@ -325,26 +325,88 @@ protected:
  const XprType &m_expr;
 };

+template<typename T,
+         typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
+         typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
+         typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+         typename RhsScalar = typename traits<typename T::Rhs>::Scalar> struct sparse_conjunction_evaluator;
+
 // "sparse .* sparse"
 template<typename T1, typename T2, typename Lhs, typename Rhs>
 struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IteratorBased>
-  : evaluator_base<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "dense .* sparse"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IndexBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "sparse .* dense"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse && sparse"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "dense && sparse"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IndexBased, IteratorBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "sparse && dense"
+template<typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IndexBased>
+  : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse ^ sparse"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IteratorBased, IteratorBased>
+  : evaluator_base<XprType>
 {
 protected:
-  typedef scalar_product_op<T1,T2> BinaryOp;
-  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
-  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
-  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator  LhsIterator;
+  typedef typename evaluator<RhsArg>::InnerIterator  RhsIterator;
  typedef typename XprType::StorageIndex StorageIndex;
  typedef typename traits<XprType>::Scalar Scalar;
 public:

-  class ReverseInnerIterator;
  class InnerIterator
  {
  public:
    
-    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor)
    {
      while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index()))
@@ -373,6 +435,7 @@ public:
    EIGEN_STRONG_INLINE Scalar value() const { return m_functor(m_lhsIter.value(), m_rhsIter.value()); }

    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }

@@ -386,11 +449,11 @@ public:
  
  
  enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
    Flags = XprType::Flags
  };
  
-  explicit binary_evaluator(const XprType& xpr)
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
    : m_functor(xpr.functor()),
      m_lhsImpl(xpr.lhs()), 
      m_rhsImpl(xpr.rhs())  
@@ -405,32 +468,32 @@ public:

 protected:
  const BinaryOp m_functor;
-  evaluator<Lhs> m_lhsImpl;
-  evaluator<Rhs> m_rhsImpl;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };

-// "dense .* sparse"
-template<typename T1, typename T2, typename Lhs, typename Rhs>
-struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IndexBased, IteratorBased>
-  : evaluator_base<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+// "dense ^ sparse"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IndexBased, IteratorBased>
+  : evaluator_base<XprType>
 {
 protected:
-  typedef scalar_product_op<T1,T2> BinaryOp;
-  typedef evaluator<Lhs>  LhsEvaluator;
-  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
-  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef evaluator<LhsArg> LhsEvaluator;
+  typedef typename evaluator<RhsArg>::InnerIterator  RhsIterator;
  typedef typename XprType::StorageIndex StorageIndex;
  typedef typename traits<XprType>::Scalar Scalar;
 public:

-  class ReverseInnerIterator;
  class InnerIterator
  {
-    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
+    enum { IsRowMajor = (int(RhsArg::Flags)&RowMajorBit)==RowMajorBit };

  public:
    
-    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
      : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_outer(outer)
    {}

@@ -444,6 +507,7 @@ public:
    { return m_functor(m_lhsEval.coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }

    EIGEN_STRONG_INLINE StorageIndex index() const { return m_rhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
    EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
    EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }

@@ -458,12 +522,12 @@ public:
  
  
  enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
    // Expose storage order of the sparse expression
-    Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit)
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(RhsArg::Flags)&RowMajorBit)
  };
  
-  explicit binary_evaluator(const XprType& xpr)
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
    : m_functor(xpr.functor()),
      m_lhsImpl(xpr.lhs()), 
      m_rhsImpl(xpr.rhs())  
@@ -478,32 +542,32 @@ public:

 protected:
  const BinaryOp m_functor;
-  evaluator<Lhs> m_lhsImpl;
-  evaluator<Rhs> m_rhsImpl;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };

-// "sparse .* dense"
-template<typename T1, typename T2, typename Lhs, typename Rhs>
-struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased>
-  : evaluator_base<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
+// "sparse ^ dense"
+template<typename XprType>
+struct sparse_conjunction_evaluator<XprType, IteratorBased, IndexBased>
+  : evaluator_base<XprType>
 {
 protected:
-  typedef scalar_product_op<T1,T2> BinaryOp;
-  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
-  typedef evaluator<Rhs>  RhsEvaluator;
-  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator LhsIterator;
+  typedef evaluator<RhsArg> RhsEvaluator;
  typedef typename XprType::StorageIndex StorageIndex;
  typedef typename traits<XprType>::Scalar Scalar;
 public:

-  class ReverseInnerIterator;
  class InnerIterator
  {
-    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
+    enum { IsRowMajor = (int(LhsArg::Flags)&RowMajorBit)==RowMajorBit };

  public:
    
-    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_outer(outer)
    {}

@@ -518,6 +582,7 @@ public:
                       m_rhsEval.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }

    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }

@@ -525,19 +590,19 @@ public:
    
  protected:
    LhsIterator m_lhsIter;
-    const evaluator<Rhs> &m_rhsEval;
+    const evaluator<RhsArg> &m_rhsEval;
    const BinaryOp& m_functor;
    const Index m_outer;
  };
  
  
  enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
    // Expose storage order of the sparse expression
-    Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit)
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(LhsArg::Flags)&RowMajorBit)
  };
  
-  explicit binary_evaluator(const XprType& xpr)
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
    : m_functor(xpr.functor()),
      m_lhsImpl(xpr.lhs()), 
      m_rhsImpl(xpr.rhs())  
@@ -552,8 +617,8 @@ public:

 protected:
  const BinaryOp m_functor;
-  evaluator<Lhs> m_lhsImpl;
-  evaluator<Rhs> m_rhsImpl;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };

 }
@@ -562,6 +627,22 @@ protected:
 * Implementation of SparseMatrixBase and SparseCwise functions/operators
 ***************************************************************************/

+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
--- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -22,7 +22,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
    typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;

    class InnerIterator;
-    class ReverseInnerIterator;
    
    enum {
      CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
@@ -41,7 +40,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>

  protected:
    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
-//     typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator;
    
    const UnaryOp m_functor;
    evaluator<ArgType> m_argImpl;
@@ -70,33 +68,6 @@ class unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::InnerIterat
    Scalar& valueRef();
 };

-// template<typename UnaryOp, typename ArgType>
-// class unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::ReverseInnerIterator
-//     : public unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalReverseIterator
-// {
-//     typedef typename XprType::Scalar Scalar;
-//     typedef typename unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalReverseIterator Base;
-//   public:
-// 
-//     EIGEN_STRONG_INLINE ReverseInnerIterator(const XprType& unaryOp, typename XprType::Index outer)
-//       : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-//     {}
-// 
-//     EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-//     { Base::operator--(); return *this; }
-// 
-//     EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
-// 
-//   protected:
-//     const UnaryOp m_functor;
-//   private:
-//     Scalar& valueRef();
-// };
-
-
-
-
-
 template<typename ViewOp, typename ArgType>
 struct unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>
  : public evaluator_base<CwiseUnaryView<ViewOp,ArgType> >
@@ -105,7 +76,6 @@ struct unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>
    typedef CwiseUnaryView<ViewOp, ArgType> XprType;

    class InnerIterator;
-    class ReverseInnerIterator;
    
    enum {
      CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<ViewOp>::Cost,
@@ -120,7 +90,6 @@ struct unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>

  protected:
    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
-//     typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator;
    
    const ViewOp m_functor;
    evaluator<ArgType> m_argImpl;
@@ -148,37 +117,16 @@ class unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::InnerItera
    const ViewOp m_functor;
 };

-// template<typename ViewOp, typename ArgType>
-// class unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::ReverseInnerIterator
-//     : public unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalReverseIterator
-// {
-//     typedef typename XprType::Scalar Scalar;
-//     typedef typename unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalReverseIterator Base;
-//   public:
-// 
-//     EIGEN_STRONG_INLINE ReverseInnerIterator(const XprType& unaryOp, typename XprType::Index outer)
-//       : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-//     {}
-// 
-//     EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-//     { Base::operator--(); return *this; }
-// 
-//     EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
-//     EIGEN_STRONG_INLINE Scalar& valueRef() { return m_functor(Base::valueRef()); }
-// 
-//   protected:
-//     const ViewOp m_functor;
-// };
-
-
 } // end namespace internal

 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 SparseMatrixBase<Derived>::operator*=(const Scalar& other)
 {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
  for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+    for (EvalIterator i(thisEval,j); i; ++i)
      i.valueRef() *= other;
  return derived();
 }
@@ -187,8 +135,10 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 SparseMatrixBase<Derived>::operator/=(const Scalar& other)
 {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
  for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+    for (EvalIterator i(thisEval,j); i; ++i)
      i.valueRef() /= other;
  return derived();
 }
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -32,18 +32,22 @@ namespace Eigen {
  * \tparam _Scalar the scalar type, i.e. the type of the coefficients
  * \tparam _Options Union of bit flags controlling the storage scheme. Currently the only possibility
  *                 is ColMajor or RowMajor. The default is 0 which means column-major.
-  * \tparam _Index the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
+  * \tparam _StorageIndex the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
+  *
+  * \warning In %Eigen 3.2, the undocumented type \c SparseMatrix::Index was improperly defined as the storage index type (e.g., int),
+  *          whereas it is now (starting from %Eigen 3.3) deprecated and always defined as Eigen::Index.
+  *          Codes making use of \c SparseMatrix::Index, might thus likely have to be changed to use \c SparseMatrix::StorageIndex instead.
  *
  * This class can be extended with the help of the plugin mechanism described on the page
  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
  */

 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<SparseMatrix<_Scalar, _Options, _Index> >
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<SparseMatrix<_Scalar, _Options, _StorageIndex> >
 {
  typedef _Scalar Scalar;
-  typedef _Index StorageIndex;
+  typedef _StorageIndex StorageIndex;
  typedef Sparse StorageKind;
  typedef MatrixXpr XprKind;
  enum {
@@ -56,16 +60,16 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> >
  };
 };

-template<typename _Scalar, int _Options, typename _Index, int DiagIndex>
-struct traits<Diagonal<SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
+template<typename _Scalar, int _Options, typename _StorageIndex, int DiagIndex>
+struct traits<Diagonal<SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
 {
-  typedef SparseMatrix<_Scalar, _Options, _Index> MatrixType;
+  typedef SparseMatrix<_Scalar, _Options, _StorageIndex> MatrixType;
  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;

  typedef _Scalar Scalar;
  typedef Dense StorageKind;
-  typedef _Index StorageIndex;
+  typedef _StorageIndex StorageIndex;
  typedef MatrixXpr XprKind;

  enum {
@@ -77,9 +81,9 @@ struct traits<Diagonal<SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
  };
 };

-template<typename _Scalar, int _Options, typename _Index, int DiagIndex>
-struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
- : public traits<Diagonal<SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
+template<typename _Scalar, int _Options, typename _StorageIndex, int DiagIndex>
+struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
+ : public traits<Diagonal<SparseMatrix<_Scalar, _Options, _StorageIndex>, DiagIndex> >
 {
  enum {
    Flags = 0
@@ -88,12 +92,13 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex>

 } // end namespace internal

-template<typename _Scalar, int _Options, typename _Index>
+template<typename _Scalar, int _Options, typename _StorageIndex>
 class SparseMatrix
-  : public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _Index> >
+  : public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _StorageIndex> >
 {
    typedef SparseCompressedBase<SparseMatrix> Base;
    using Base::convert_index;
+    friend class SparseVector<_Scalar,0,_StorageIndex>;
  public:
    using Base::isCompressed;
    using Base::nonZeros;
@@ -785,30 +790,38 @@ class SparseMatrix
      EIGEN_DBG_SPARSE(
        s << "Nonzero entries:\n";
        if(m.isCompressed())
+        {
          for (Index i=0; i<m.nonZeros(); ++i)
            s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
+        }
        else
+        {
          for (Index i=0; i<m.outerSize(); ++i)
          {
            Index p = m.m_outerIndex[i];
            Index pe = m.m_outerIndex[i]+m.m_innerNonZeros[i];
            Index k=p;
-            for (; k<pe; ++k)
+            for (; k<pe; ++k) {
              s << "(" << m.m_data.value(k) << "," << m.m_data.index(k) << ") ";
-            for (; k<m.m_outerIndex[i+1]; ++k)
+            }
+            for (; k<m.m_outerIndex[i+1]; ++k) {
              s << "(_,_) ";
+            }
          }
+        }
        s << std::endl;
        s << std::endl;
        s << "Outer pointers:\n";
-        for (Index i=0; i<m.outerSize(); ++i)
+        for (Index i=0; i<m.outerSize(); ++i) {
          s << m.m_outerIndex[i] << " ";
+        }
        s << " $" << std::endl;
        if(!m.isCompressed())
        {
          s << "Inner non zeros:\n";
-          for (Index i=0; i<m.outerSize(); ++i)
+          for (Index i=0; i<m.outerSize(); ++i) {
            s << m.m_innerNonZeros[i] << " ";
+          }
          s << " $" << std::endl;
        }
        s << std::endl;
@@ -975,11 +988,11 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
  * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
  * be explicitely stored into a std::vector for instance.
  */
-template<typename Scalar, int _Options, typename _Index>
+template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename InputIterators>
-void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
+void SparseMatrix<Scalar,_Options,_StorageIndex>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
 {
-  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_Index> >(begin, end, *this, internal::scalar_sum_op<Scalar,Scalar>());
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_StorageIndex> >(begin, end, *this, internal::scalar_sum_op<Scalar,Scalar>());
 }

 /** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied:
@@ -991,17 +1004,17 @@ void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators&
  * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
  * \endcode
  */
-template<typename Scalar, int _Options, typename _Index>
+template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename InputIterators,typename DupFunctor>
-void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
+void SparseMatrix<Scalar,_Options,_StorageIndex>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
 {
-  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_Index>, DupFunctor>(begin, end, *this, dup_func);
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_StorageIndex>, DupFunctor>(begin, end, *this, dup_func);
 }

 /** \internal */
-template<typename Scalar, int _Options, typename _Index>
+template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename DupFunctor>
-void SparseMatrix<Scalar,_Options,_Index>::collapseDuplicates(DupFunctor dup_func)
+void SparseMatrix<Scalar,_Options,_StorageIndex>::collapseDuplicates(DupFunctor dup_func)
 {
  eigen_assert(!isCompressed());
  // TODO, in practice we should be able to use m_innerNonZeros for that task
@@ -1039,9 +1052,9 @@ void SparseMatrix<Scalar,_Options,_Index>::collapseDuplicates(DupFunctor dup_fun
  m_data.resize(m_outerIndex[m_outerSize]);
 }

-template<typename Scalar, int _Options, typename _Index>
+template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename OtherDerived>
-EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Options,_Index>::operator=(const SparseMatrixBase<OtherDerived>& other)
+EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_StorageIndex>& SparseMatrix<Scalar,_Options,_StorageIndex>::operator=(const SparseMatrixBase<OtherDerived>& other)
 {
  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
@@ -1112,8 +1125,8 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
  }
 }

-template<typename _Scalar, int _Options, typename _Index>
-typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insert(Index row, Index col)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insert(Index row, Index col)
 {
  eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
  
@@ -1232,8 +1245,8 @@ typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Op
  return insertUncompressed(row,col);
 }
    
-template<typename _Scalar, int _Options, typename _Index>
-EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insertUncompressed(Index row, Index col)
 {
  eigen_assert(!isCompressed());

@@ -1264,8 +1277,8 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
  return (m_data.value(p) = 0);
 }

-template<typename _Scalar, int _Options, typename _Index>
-EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertCompressed(Index row, Index col)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Scalar,_Options,_StorageIndex>::insertCompressed(Index row, Index col)
 {
  eigen_assert(isCompressed());

@@ -1373,12 +1386,12 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse

 namespace internal {

-template<typename _Scalar, int _Options, typename _Index>
-struct evaluator<SparseMatrix<_Scalar,_Options,_Index> >
-  : evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > >
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<SparseMatrix<_Scalar,_Options,_StorageIndex> >
+  : evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_StorageIndex> > >
 {
-  typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > Base;
-  typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType;
+  typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_StorageIndex> > > Base;
+  typedef SparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType;
  evaluator() : Base() {}
  explicit evaluator(const SparseMatrixType &mat) : Base(mat) {}
 };
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -37,7 +37,11 @@ template<typename Derived> class SparseMatrixBase
    
    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+
+    /** The integer type used to \b store indices within a SparseMatrix.
+      * For a \c SparseMatrix<Scalar,Options,IndexType> it an alias of the third template parameter \c IndexType. */
    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+
    typedef typename internal::add_const_on_value_type_if_arithmetic<
                         typename internal::packet_traits<Scalar>::type
                     >::type PacketReturnType;
@@ -213,11 +217,12 @@ template<typename Derived> class SparseMatrixBase

      if (Flags&RowMajorBit)
      {
-        const Nested nm(m.derived());
+        Nested nm(m.derived());
+        internal::evaluator<NestedCleaned> thisEval(nm);
        for (Index row=0; row<nm.outerSize(); ++row)
        {
          Index col = 0;
-          for (typename NestedCleaned::InnerIterator it(nm.derived(), row); it; ++it)
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it)
          {
            for ( ; col<it.index(); ++col)
              s << "0 ";
@@ -231,10 +236,11 @@ template<typename Derived> class SparseMatrixBase
      }
      else
      {
-        const Nested nm(m.derived());
+        Nested nm(m.derived());
+        internal::evaluator<NestedCleaned> thisEval(nm);
        if (m.cols() == 1) {
          Index row = 0;
-          for (typename NestedCleaned::InnerIterator it(nm.derived(), 0); it; ++it)
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it)
          {
            for ( ; row<it.index(); ++row)
              s << "0" << std::endl;
@@ -263,6 +269,11 @@ template<typename Derived> class SparseMatrixBase
    template<typename OtherDerived>
    Derived& operator-=(const DiagonalBase<OtherDerived>& other);

+    template<typename OtherDerived>
+    Derived& operator+=(const EigenBase<OtherDerived> &other);
+    template<typename OtherDerived>
+    Derived& operator-=(const EigenBase<OtherDerived> &other);
+
    Derived& operator*=(const Scalar& other);
    Derived& operator/=(const Scalar& other);

--- a/Eigen/src/SparseCore/SparseRef.h
+++ b/Eigen/src/SparseCore/SparseRef.h
@@ -185,20 +185,27 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)

    template<typename Derived>
-    inline Ref(const SparseMatrixBase<Derived>& expr)
+    inline Ref(const SparseMatrixBase<Derived>& expr) : m_hasCopy(false)
    {
      construct(expr.derived(), typename Traits::template match<Derived>::type());
    }

-    inline Ref(const Ref& other) : Base(other) {
+    inline Ref(const Ref& other) : Base(other), m_hasCopy(false) {
      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
    }

    template<typename OtherRef>
-    inline Ref(const RefBase<OtherRef>& other) {
+    inline Ref(const RefBase<OtherRef>& other) : m_hasCopy(false) {
      construct(other.derived(), typename Traits::template match<OtherRef>::type());
    }

+    ~Ref() {
+      if(m_hasCopy) {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        obj->~TPlainObjectType();
+      }
+    }
+
  protected:

    template<typename Expression>
@@ -208,6 +215,7 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
      {
        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
        ::new (obj) TPlainObjectType(expr);
+        m_hasCopy = true;
        Base::construct(*obj);
      }
      else
@@ -221,11 +229,13 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
    {
      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
      ::new (obj) TPlainObjectType(expr);
+      m_hasCopy = true;
      Base::construct(*obj);
    }

  protected:
    char m_object_bytes[sizeof(TPlainObjectType)];
+    bool m_hasCopy;
 };


@@ -293,20 +303,27 @@ class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType
    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)

    template<typename Derived>
-    inline Ref(const SparseMatrixBase<Derived>& expr)
+    inline Ref(const SparseMatrixBase<Derived>& expr) : m_hasCopy(false)
    {
      construct(expr.derived(), typename Traits::template match<Derived>::type());
    }

-    inline Ref(const Ref& other) : Base(other) {
+    inline Ref(const Ref& other) : Base(other), m_hasCopy(false) {
      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
    }

    template<typename OtherRef>
-    inline Ref(const RefBase<OtherRef>& other) {
+    inline Ref(const RefBase<OtherRef>& other) : m_hasCopy(false) {
      construct(other.derived(), typename Traits::template match<OtherRef>::type());
    }

+    ~Ref() {
+      if(m_hasCopy) {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        obj->~TPlainObjectType();
+      }
+    }
+
  protected:

    template<typename Expression>
@@ -320,11 +337,13 @@ class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType
    {
      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
      ::new (obj) TPlainObjectType(expr);
+      m_hasCopy = true;
      Base::construct(*obj);
    }

  protected:
    char m_object_bytes[sizeof(TPlainObjectType)];
+    bool m_hasCopy;
 };

 namespace internal {
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -222,14 +222,43 @@ template< typename DstXprType, typename SrcXprType, typename Functor>
 struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
 {
  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> AssignOpType;
+
  template<typename DestScalar,int StorageOrder>
-  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignOpType&/*func*/)
  {
    internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
  }
+
+  // FIXME: the handling of += and -= in sparse matrices should be cleanup so that next two overloads could be reduced to:
+  template<typename DestScalar,int StorageOrder,typename AssignFunc>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignFunc& func)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    call_assignment_no_alias_no_transpose(dst, tmp, func);
+  }
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+                  const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    dst += tmp;
+  }
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+                  const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    dst -= tmp;
+  }
  
  template<typename DestScalar>
-  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const AssignOpType&/*func*/)
  {
    // TODO directly evaluate into dst;
    SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(dst.rows(),dst.cols());
--- a/Eigen/src/SparseCore/SparseSolverBase.h
+++ b/Eigen/src/SparseCore/SparseSolverBase.h
@@ -19,7 +19,8 @@ namespace internal {
  * The rhs is decomposed into small vertical panels which are solved through dense temporaries.
  */
 template<typename Decomposition, typename Rhs, typename Dest>
-void solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest)
+typename enable_if<Rhs::ColsAtCompileTime!=1 && Dest::ColsAtCompileTime!=1>::type
+solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest)
 {
  EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
  typedef typename Dest::Scalar DestScalar;
@@ -40,6 +41,19 @@ void solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs,
  }
 }

+// Overload for vector as rhs
+template<typename Decomposition, typename Rhs, typename Dest>
+typename enable_if<Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1>::type
+solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest)
+{
+  typedef typename Dest::Scalar DestScalar;
+  Index size = rhs.rows();
+  Eigen::Matrix<DestScalar,Dynamic,1> rhs_dense(rhs);
+  Eigen::Matrix<DestScalar,Dynamic,1> dest_dense(size);
+  dest_dense = dec.solve(rhs_dense);
+  dest = dest_dense.sparseView();
+}
+
 } // end namespace internal

 /** \class SparseSolverBase
--- a/Eigen/src/SparseCore/SparseTranspose.h
+++ b/Eigen/src/SparseCore/SparseTranspose.h
@@ -56,7 +56,6 @@ struct unary_evaluator<Transpose<ArgType>, IteratorBased>
  : public evaluator_base<Transpose<ArgType> >
 {
    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
-    typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator;
  public:
    typedef Transpose<ArgType> XprType;
    
@@ -75,17 +74,6 @@ struct unary_evaluator<Transpose<ArgType>, IteratorBased>
      Index col() const { return EvalIterator::row(); }
    };
    
-    class ReverseInnerIterator : public EvalReverseIterator
-    {
-    public:
-      EIGEN_STRONG_INLINE ReverseInnerIterator(const unary_evaluator& unaryOp, Index outer)
-        : EvalReverseIterator(unaryOp.m_argImpl,outer)
-      {}
-      
-      Index row() const { return EvalReverseIterator::col(); }
-      Index col() const { return EvalReverseIterator::row(); }
-    };
-    
    enum {
      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
      Flags = XprType::Flags
--- a/Eigen/src/SparseCore/SparseTriangularView.h
+++ b/Eigen/src/SparseCore/SparseTriangularView.h
@@ -55,7 +55,10 @@ template<typename MatrixType, unsigned int Mode> class TriangularViewImpl<Matrix
      this->solveInPlace(dst);
    }

+    /** Applies the inverse of \c *this to the dense vector or matrix \a other, "in-place" */
    template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
+
+    /** Applies the inverse of \c *this to the sparse vector or matrix \a other, "in-place" */
    template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
  
 };
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -290,6 +290,14 @@ class SparseVector
      m_data.swap(other.m_data);
    }

+    template<int OtherOptions>
+    inline void swap(SparseMatrix<Scalar,OtherOptions,StorageIndex>& other)
+    {
+      eigen_assert(other.outerSize()==1);
+      std::swap(m_size, other.m_innerSize);
+      m_data.swap(other.m_data);
+    }
+
    inline SparseVector& operator=(const SparseVector& other)
    {
      if (other.isRValue())
@@ -403,6 +411,7 @@ struct evaluator<SparseVector<_Scalar,_Options,_Index> >
  : evaluator_base<SparseVector<_Scalar,_Options,_Index> >
 {
  typedef SparseVector<_Scalar,_Options,_Index> SparseVectorType;
+  typedef evaluator_base<SparseVectorType> Base;
  typedef typename SparseVectorType::InnerIterator InnerIterator;
  typedef typename SparseVectorType::ReverseInnerIterator ReverseInnerIterator;
  
@@ -410,20 +419,22 @@ struct evaluator<SparseVector<_Scalar,_Options,_Index> >
    CoeffReadCost = NumTraits<_Scalar>::ReadCost,
    Flags = SparseVectorType::Flags
  };
+
+  evaluator() : Base() {}
  
-  explicit evaluator(const SparseVectorType &mat) : m_matrix(mat)
+  explicit evaluator(const SparseVectorType &mat) : m_matrix(&mat)
  {
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }
  
  inline Index nonZerosEstimate() const {
-    return m_matrix.nonZeros();
+    return m_matrix->nonZeros();
  }
  
-  operator SparseVectorType&() { return m_matrix.const_cast_derived(); }
-  operator const SparseVectorType&() const { return m_matrix; }
+  operator SparseVectorType&() { return m_matrix->const_cast_derived(); }
+  operator const SparseVectorType&() const { return *m_matrix; }
  
-  const SparseVectorType &m_matrix;
+  const SparseVectorType *m_matrix;
 };

 template< typename Dest, typename Src>
--- a/Eigen/src/SparseCore/SparseView.h
+++ b/Eigen/src/SparseCore/SparseView.h
@@ -27,6 +27,20 @@ struct traits<SparseView<MatrixType> > : traits<MatrixType>

 } // end namespace internal

+/** \ingroup SparseCore_Module
+  * \class SparseView
+  *
+  * \brief Expression of a dense or sparse matrix with zero or too small values removed
+  *
+  * \tparam MatrixType the type of the object of which we are removing the small entries
+  *
+  * This class represents an expression of a given dense or sparse matrix with
+  * entries smaller than \c reference * \c epsilon are removed.
+  * It is the return type of MatrixBase::sparseView() and SparseMatrixBase::pruned()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::sparseView(), SparseMatrixBase::pruned()
+  */
 template<typename MatrixType>
 class SparseView : public SparseMatrixBase<SparseView<MatrixType> >
 {
@@ -190,6 +204,23 @@ struct unary_evaluator<SparseView<ArgType>, IndexBased>

 } // end namespace internal

+/** \ingroup SparseCore_Module
+  *
+  * \returns a sparse expression of the dense expression \c *this with values smaller than
+  * \a reference * \a epsilon removed.
+  *
+  * This method is typically used when prototyping to convert a quickly assembled dense Matrix \c D to a SparseMatrix \c S:
+  * \code
+  * MatrixXd D(n,m);
+  * SparseMatrix<double> S;
+  * S = D.sparseView();             // suppress numerical zeros (exact)
+  * S = D.sparseView(reference);
+  * S = D.sparseView(reference,epsilon);
+  * \endcode
+  * where \a reference is a meaningful non zero reference value,
+  * and \a epsilon is a tolerance factor defaulting to NumTraits<Scalar>::dummy_precision().
+  *
+  * \sa SparseMatrixBase::pruned(), class SparseView */
 template<typename Derived>
 const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& reference,
                                                          const typename NumTraits<Scalar>::Real& epsilon) const
@@ -198,7 +229,7 @@ const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& referenc
 }

 /** \returns an expression of \c *this with values smaller than
-  * \a reference * \a epsilon are removed.
+  * \a reference * \a epsilon removed.
  *
  * This method is typically used in conjunction with the product of two sparse matrices
  * to automatically prune the smallest values as follows:
--- a/Eigen/src/SparseCore/TriangularSolver.h
+++ b/Eigen/src/SparseCore/TriangularSolver.h
@@ -171,6 +171,8 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>

 } // end namespace internal

+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
 template<typename ExpressionType,unsigned int Mode>
 template<typename OtherDerived>
 void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(MatrixBase<OtherDerived>& other) const
@@ -189,6 +191,7 @@ void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(MatrixBase<Oth
  if (copy)
    other = otherCopy;
 }
+#endif

 // pure sparse path

@@ -286,6 +289,7 @@ struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>

 } // end namespace internal

+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename ExpressionType,unsigned int Mode>
 template<typename OtherDerived>
 void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
@@ -304,6 +308,7 @@ void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(SparseMatrixBa
 //   if (copy)
 //     other = otherCopy;
 }
+#endif

 } // end namespace Eigen

--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -748,7 +748,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
      else
      {
        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
        U = A.template triangularView<Upper>().solve(U);
      }

--- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -239,7 +239,7 @@ void MappedSuperNodalMatrix<Scalar,Index_>::solveInPlace( MatrixBase<Dest>&X) co
    Index n    = int(X.rows());
    Index nrhs = Index(X.cols());
    const Scalar * Lval = valuePtr();                 // Nonzero values 
-    Matrix<Scalar,Dynamic,Dynamic, ColMajor> work(n, nrhs);     // working vector
+    Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor> work(n, nrhs);     // working vector
    work.setZero();
    for (Index k = 0; k <= nsuper(); k ++)
    {
@@ -271,12 +271,12 @@ void MappedSuperNodalMatrix<Scalar,Index_>::solveInPlace( MatrixBase<Dest>&X) co
        
        // Triangular solve 
        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); 
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
        U = A.template triangularView<UnitLower>().solve(U); 
        
        // Matrix-vector product 
        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
-        work.block(0, 0, nrow, nrhs) = A * U; 
+        work.topRows(nrow).noalias() = A * U;
        
        //Begin Scatter 
        for (Index j = 0; j < nrhs; j++)
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -106,22 +106,22 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
        
 #define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}
 #define WORK(I)  \
-                    c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
-                    c1 = pload<Packet>(C1+i+(I)*PacketSize);   \
-                    KMADD(c0, a0, b00, t0)      \
-                    KMADD(c1, a0, b01, t1)      \
-                    a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-                    KMADD(c0, a1, b10, t0)      \
-                    KMADD(c1, a1, b11, t1)       \
-                    a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-          if(RK==4) KMADD(c0, a2, b20, t0)       \
-          if(RK==4) KMADD(c1, a2, b21, t1)       \
-          if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
-          if(RK==4) KMADD(c0, a3, b30, t0)       \
-          if(RK==4) KMADD(c1, a3, b31, t1)       \
-          if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
-                    pstore(C0+i+(I)*PacketSize, c0);           \
-                    pstore(C1+i+(I)*PacketSize, c1)
+                     c0 = pload<Packet>(C0+i+(I)*PacketSize);    \
+                     c1 = pload<Packet>(C1+i+(I)*PacketSize);    \
+                     KMADD(c0, a0, b00, t0)                      \
+                     KMADD(c1, a0, b01, t1)                      \
+                     a0 = pload<Packet>(A0+i+(I+1)*PacketSize);  \
+                     KMADD(c0, a1, b10, t0)                      \
+                     KMADD(c1, a1, b11, t1)                      \
+                     a1 = pload<Packet>(A1+i+(I+1)*PacketSize);  \
+          if(RK==4){ KMADD(c0, a2, b20, t0)                     }\
+          if(RK==4){ KMADD(c1, a2, b21, t1)                     }\
+          if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize); }\
+          if(RK==4){ KMADD(c0, a3, b30, t0)                     }\
+          if(RK==4){ KMADD(c1, a3, b31, t1)                     }\
+          if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\
+                     pstore(C0+i+(I)*PacketSize, c0);            \
+                     pstore(C1+i+(I)*PacketSize, c1)
        
        // process rows of A' - C' with aggressive vectorization and peeling 
        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
@@ -131,14 +131,15 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
                    prefetch((A1+i+(5)*PacketSize));
          if(RK==4) prefetch((A2+i+(5)*PacketSize));
          if(RK==4) prefetch((A3+i+(5)*PacketSize));
-                    WORK(0);
-                    WORK(1);
-                    WORK(2);
-                    WORK(3);
-                    WORK(4);
-                    WORK(5);
-                    WORK(6);
-                    WORK(7);
+
+          WORK(0);
+          WORK(1);
+          WORK(2);
+          WORK(3);
+          WORK(4);
+          WORK(5);
+          WORK(6);
+          WORK(7);
        }
        // process the remaining rows with vectorization only
        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
@@ -203,16 +204,16 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
        }
        
 #define WORK(I) \
-                  c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
-                  KMADD(c0, a0, b00, t0)       \
-                  a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-                  KMADD(c0, a1, b10, t0)       \
-                  a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-        if(RK==4) KMADD(c0, a2, b20, t0)       \
-        if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
-        if(RK==4) KMADD(c0, a3, b30, t0)       \
-        if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
-                  pstore(C0+i+(I)*PacketSize, c0);
+                   c0 = pload<Packet>(C0+i+(I)*PacketSize);     \
+                   KMADD(c0, a0, b00, t0)                       \
+                   a0 = pload<Packet>(A0+i+(I+1)*PacketSize);   \
+                   KMADD(c0, a1, b10, t0)                       \
+                   a1 = pload<Packet>(A1+i+(I+1)*PacketSize);   \
+        if(RK==4){ KMADD(c0, a2, b20, t0)                      }\
+        if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize);  }\
+        if(RK==4){ KMADD(c0, a3, b30, t0)                      }\
+        if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize);  }\
+                   pstore(C0+i+(I)*PacketSize, c0);
        
        // agressive vectorization and peeling
        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -967,6 +967,7 @@ void SuperILU<MatrixType>::factorize(const MatrixType& a)
  m_factorizationIsOk = true;
 }

+#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType>
 template<typename Rhs,typename Dest>
 void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
@@ -1019,6 +1020,8 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
 }
 #endif

+#endif
+
 } // end namespace Eigen

 #endif // EIGEN_SUPERLUSUPPORT_H
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -269,44 +269,6 @@ const CwiseBinaryOp<internal::scalar_difference_op<T,Scalar>,Constant<T>,Derived
  operator/(const T& s,const StorageBaseType& a);
 #endif

-/** \returns an expression of the coefficient-wise && operator of *this and \a other
-  *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_and.cpp
-  * Output: \verbinclude Cwise_boolean_and.out
-  *
-  * \sa operator||(), select()
-  */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
-operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
-}
-
-/** \returns an expression of the coefficient-wise || operator of *this and \a other
-  *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_or.cpp
-  * Output: \verbinclude Cwise_boolean_or.out
-  *
-  * \sa operator&&(), select()
-  */
-template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
-operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
-}
-
 /** \returns an expression of the coefficient-wise ^ operator of *this and \a other
 *
 * \warning this operator is for expression of bool only.
--- a/Eigen/src/plugins/BlockMethods.h
+++ b/Eigen/src/plugins/BlockMethods.h
@@ -818,7 +818,7 @@ inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index sta
  return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }

-/// This is the const version of block<>(Index, Index, Index, Index). */
+/// This is the const version of block<>(Index, Index, Index, Index).
 template<int NRows, int NCols>
 inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                              Index blockRows, Index blockCols) const
@@ -832,15 +832,15 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
 /// Output: \verbinclude MatrixBase_col.out
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
-///
-/// \sa row(), class Block */
+/**
+  * \sa row(), class Block */
 EIGEN_DEVICE_FUNC
 inline ColXpr col(Index i)
 {
  return ColXpr(derived(), i);
 }

-/// This is the const version of col(). */
+/// This is the const version of col().
 EIGEN_DEVICE_FUNC
 inline ConstColXpr col(Index i) const
 {
@@ -853,8 +853,8 @@ inline ConstColXpr col(Index i) const
 /// Output: \verbinclude MatrixBase_row.out
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
-///
-/// \sa col(), class Block */
+/**
+  * \sa col(), class Block */
 EIGEN_DEVICE_FUNC
 inline RowXpr row(Index i)
 {
--- a/Eigen/src/plugins/CommonCwiseBinaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseBinaryOps.h
@@ -75,3 +75,41 @@ EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/,quotient)
 template<typename T>
 const CwiseBinaryOp<internal::scalar_quotient_op<Scalar,T>,Derived,Constant<T> > operator/(const T& scalar) const;
 #endif
+
+/** \returns an expression of the coefficient-wise boolean \b and operator of \c *this and \a other
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_and.cpp
+  * Output: \verbinclude Cwise_boolean_and.out
+  *
+  * \sa operator||(), select()
+  */
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
+operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
+}
+
+/** \returns an expression of the coefficient-wise boolean \b or operator of \c *this and \a other
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_or.cpp
+  * Output: \verbinclude Cwise_boolean_or.out
+  *
+  * \sa operator&&(), select()
+  */
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
+operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
+}
--- a/README.md
+++ b/README.md
@@ -1,3 +1,3 @@
-**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
-
-For more information go to http://eigen.tuxfamily.org/.
+**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
+
+For more information go to http://eigen.tuxfamily.org/.
--- a/cmake/Eigen3Config.cmake.in
+++ b/cmake/Eigen3Config.cmake.in
@@ -1,28 +1,21 @@
-#                                               -*- cmake -*-
-#
-#  Eigen3Config.cmake(.in)
+# This file exports the Eigen3::Eigen CMake target which should be passed to the
+# target_link_libraries command.

-# Use the following variables to compile and link against Eigen:
-#  EIGEN3_FOUND              - True if Eigen was found on your system
-#  EIGEN3_USE_FILE           - The file making Eigen usable
-#  EIGEN3_DEFINITIONS        - Definitions needed to build with Eigen
-#  EIGEN3_INCLUDE_DIR        - Directory where signature_of_eigen3_matrix_library can be found
-#  EIGEN3_INCLUDE_DIRS       - List of directories of Eigen and it's dependencies
-#  EIGEN3_ROOT_DIR           - The base directory of Eigen
-#  EIGEN3_VERSION_STRING     - A human-readable string containing the version
-#  EIGEN3_VERSION_MAJOR      - The major version of Eigen
-#  EIGEN3_VERSION_MINOR      - The minor version of Eigen
-#  EIGEN3_VERSION_PATCH      - The patch version of Eigen
+@PACKAGE_INIT@

-set ( EIGEN3_FOUND 1 )
-set ( EIGEN3_USE_FILE     "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake" )
+include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")

-set ( EIGEN3_DEFINITIONS  "@EIGEN_DEFINITIONS@" )
-set ( EIGEN3_INCLUDE_DIR  "@EIGEN_INCLUDE_DIR@" )
-set ( EIGEN3_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" )
-set ( EIGEN3_ROOT_DIR     "@EIGEN_ROOT_DIR@" )
+# Legacy variables, do *not* use. May be removed in the future.

-set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" )
-set ( EIGEN3_VERSION_MAJOR  "@EIGEN_VERSION_MAJOR@" )
-set ( EIGEN3_VERSION_MINOR  "@EIGEN_VERSION_MINOR@" )
-set ( EIGEN3_VERSION_PATCH  "@EIGEN_VERSION_PATCH@" )
+set (EIGEN3_FOUND 1)
+set (EIGEN3_USE_FILE    "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake")
+
+set (EIGEN3_DEFINITIONS  "@EIGEN_DEFINITIONS@")
+set (EIGEN3_INCLUDE_DIR  "@PACKAGE_EIGEN_INCLUDE_DIR@")
+set (EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@")
+set (EIGEN3_ROOT_DIR     "@PACKAGE_EIGEN_ROOT_DIR@")
+
+set (EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@")
+set (EIGEN3_VERSION_MAJOR  "@EIGEN_VERSION_MAJOR@")
+set (EIGEN3_VERSION_MINOR  "@EIGEN_VERSION_MINOR@")
+set (EIGEN3_VERSION_PATCH  "@EIGEN_VERSION_PATCH@")
--- a/cmake/Eigen3ConfigLegacy.cmake.in
+++ b/cmake/Eigen3ConfigLegacy.cmake.in
@@ -0,0 +1,30 @@
+#                                               -*- cmake -*-
+#
+#  Eigen3Config.cmake(.in)
+
+# Use the following variables to compile and link against Eigen:
+#  EIGEN3_FOUND              - True if Eigen was found on your system
+#  EIGEN3_USE_FILE           - The file making Eigen usable
+#  EIGEN3_DEFINITIONS        - Definitions needed to build with Eigen
+#  EIGEN3_INCLUDE_DIR        - Directory where signature_of_eigen3_matrix_library can be found
+#  EIGEN3_INCLUDE_DIRS       - List of directories of Eigen and it's dependencies
+#  EIGEN3_ROOT_DIR           - The base directory of Eigen
+#  EIGEN3_VERSION_STRING     - A human-readable string containing the version
+#  EIGEN3_VERSION_MAJOR      - The major version of Eigen
+#  EIGEN3_VERSION_MINOR      - The minor version of Eigen
+#  EIGEN3_VERSION_PATCH      - The patch version of Eigen
+
+@PACKAGE_INIT@
+
+set ( EIGEN3_FOUND 1 )
+set ( EIGEN3_USE_FILE     "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake" )
+
+set ( EIGEN3_DEFINITIONS  "@EIGEN_DEFINITIONS@" )
+set ( EIGEN3_INCLUDE_DIR  "@PACKAGE_EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_ROOT_DIR     "@PACKAGE_EIGEN_ROOT_DIR@" )
+
+set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" )
+set ( EIGEN3_VERSION_MAJOR  "@EIGEN_VERSION_MAJOR@" )
+set ( EIGEN3_VERSION_MINOR  "@EIGEN_VERSION_MINOR@" )
+set ( EIGEN3_VERSION_PATCH  "@EIGEN_VERSION_PATCH@" )
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -1,23 +1,23 @@

 macro(ei_add_property prop value)
-  get_property(previous GLOBAL PROPERTY ${prop})  
+  get_property(previous GLOBAL PROPERTY ${prop})
  if ((NOT previous) OR (previous STREQUAL ""))
    set_property(GLOBAL PROPERTY ${prop} "${value}")
  else()
    set_property(GLOBAL PROPERTY ${prop} "${previous} ${value}")
-  endif()  
+  endif()
 endmacro(ei_add_property)

 #internal. See documentation of ei_add_test for details.
 macro(ei_add_test_internal testname testname_with_suffix)
  set(targetname ${testname_with_suffix})
-  
+
  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
  else()
    set(filename ${testname}.cpp)
  endif()
-  
+
  if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
    if(EIGEN_TEST_CUDA_CLANG)
      set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
@@ -42,7 +42,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
  else()
    add_executable(${targetname} ${filename})
  endif()
-  
+
  if (targetname MATCHES "^eigen2_")
    add_dependencies(eigen2_buildtests ${targetname})
  else()
@@ -56,20 +56,20 @@ macro(ei_add_test_internal testname testname_with_suffix)
      ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1")
    endif(EIGEN_DEBUG_ASSERTS)
  endif(EIGEN_NO_ASSERTION_CHECKING)
-  
+
  ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")

  ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}")
-  
+
  if(MSVC)
    ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
-  endif()  
+  endif()

  # let the user pass flags.
  if(${ARGC} GREATER 2)
    ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}")
  endif(${ARGC} GREATER 2)
-  
+
  if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
    ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
  endif()
@@ -95,12 +95,12 @@ macro(ei_add_test_internal testname testname_with_suffix)
      # notice: no double quotes around ${libs_to_link} here. It may be a list.
      target_link_libraries(${targetname} ${libs_to_link})
    endif()
-  endif() 
+  endif()

  add_test(${testname_with_suffix} "${targetname}")
-  
+
  # Specify target and test labels accoirding to EIGEN_CURRENT_SUBPROJECT
-  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)  
+  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
  if ((current_subproject) AND (NOT (current_subproject STREQUAL "")))
    set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}")
    add_dependencies("Build${current_subproject}" ${targetname})
@@ -128,14 +128,14 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix)
    OUTPUT ${include_file}
    COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file}
    COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}.sycl\\\"" >> ${include_file}
-    DEPENDS ${filename}
+    DEPENDS ${filename} ${bc_file}.sycl
    COMMENT "Building ComputeCpp integration header file ${include_file}"
  )
  # Add a custom target for the generated integration header
-  add_custom_target(${testname}_integration_header_woho DEPENDS ${include_file})
+  add_custom_target(${testname}_integration_header_sycl DEPENDS ${include_file})

  add_executable(${targetname} ${include_file})
-  add_dependencies(${targetname} ${testname}_integration_header_woho)
+  add_dependencies(${targetname} ${testname}_integration_header_sycl)
  add_sycl_to_target(${targetname} ${filename} ${CMAKE_CURRENT_BINARY_DIR})

  if (targetname MATCHES "^eigen2_")
@@ -258,7 +258,7 @@ macro(ei_add_test testname)
  else()
    set(filename ${testname}.cpp)
  endif()
-  
+
  file(READ "${filename}" test_source)
  set(parts 0)
  string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
@@ -379,7 +379,7 @@ macro(ei_testing_print_summary)
  elseif(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION)
    message(STATUS "Explicit vectorization disabled (alignment kept enabled)")
  else()
-  
+
  message(STATUS "Maximal matrix/vector size: ${EIGEN_TEST_MAX_SIZE}")

    if(EIGEN_TEST_SSE2)
@@ -453,13 +453,13 @@ macro(ei_testing_print_summary)
    else()
      message(STATUS "ARMv8 NEON:        Using architecture defaults")
    endif()
- 
+
    if(EIGEN_TEST_ZVECTOR)
      message(STATUS "S390X ZVECTOR:     ON")
    else()
      message(STATUS "S390X ZVECTOR:     Using architecture defaults")
    endif()
-   
+
    if(EIGEN_TEST_CXX11)
      message(STATUS "C++11:             ON")
    else()
@@ -505,7 +505,7 @@ macro(ei_init_testing)

  set_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT "0")
  set_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT "0")
-  
+
  # uncomment anytime you change the ei_get_compilerver_from_cxx_version_string macro
  # ei_test_get_compilerver_from_cxx_version_string()
 endmacro(ei_init_testing)
@@ -514,22 +514,22 @@ macro(ei_set_sitename)
  # if the sitename is not yet set, try to set it
  if(NOT ${SITE} OR ${SITE} STREQUAL "")
    set(eigen_computername $ENV{COMPUTERNAME})
-	set(eigen_hostname $ENV{HOSTNAME})
+    set(eigen_hostname $ENV{HOSTNAME})
    if(eigen_hostname)
      set(SITE ${eigen_hostname})
-	elseif(eigen_computername)
-	  set(SITE ${eigen_computername})
+    elseif(eigen_computername)
+      set(SITE ${eigen_computername})
    endif()
  endif()
  # in case it is already set, enforce lower case
  if(SITE)
    string(TOLOWER ${SITE} SITE)
-  endif()  
+  endif()
 endmacro(ei_set_sitename)

 macro(ei_get_compilerver VAR)
    if(MSVC)
-      # on windows system, we use a modified CMake script  
+      # on windows system, we use a modified CMake script
      include(EigenDetermineVSServicePack)
      EigenDetermineVSServicePack( my_service_pack )

@@ -541,20 +541,20 @@ macro(ei_get_compilerver VAR)
    else()
    # on all other system we rely on ${CMAKE_CXX_COMPILER}
    # supporting a "--version" or "/version" flag
-    
+
    if(WIN32 AND ${CMAKE_CXX_COMPILER_ID} EQUAL "Intel")
      set(EIGEN_CXX_FLAG_VERSION "/version")
    else()
      set(EIGEN_CXX_FLAG_VERSION "--version")
    endif()
-    
+
    execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION}
                    OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE)
    string(REGEX REPLACE "[\n\r].*"  ""  eigen_cxx_compiler_version_string  ${eigen_cxx_compiler_version_string})
-    
+
    ei_get_compilerver_from_cxx_version_string("${eigen_cxx_compiler_version_string}" CNAME CVER)
    set(${VAR} "${CNAME}-${CVER}")
-    
+
  endif()
 endmacro(ei_get_compilerver)

@@ -563,13 +563,13 @@ endmacro(ei_get_compilerver)
 # the testing macro call in ei_init_testing() of the EigenTesting.cmake file.
 # See also the ei_test_get_compilerver_from_cxx_version_string macro at the end of the file
 macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
-  # extract possible compiler names  
+  # extract possible compiler names
  string(REGEX MATCH "g\\+\\+"      ei_has_gpp    ${VERSTRING})
  string(REGEX MATCH "llvm|LLVM"    ei_has_llvm   ${VERSTRING})
  string(REGEX MATCH "gcc|GCC"      ei_has_gcc    ${VERSTRING})
  string(REGEX MATCH "icpc|ICC"     ei_has_icpc   ${VERSTRING})
  string(REGEX MATCH "clang|CLANG"  ei_has_clang  ${VERSTRING})
-  
+
  # combine them
  if((ei_has_llvm) AND (ei_has_gpp OR ei_has_gcc))
    set(${CNAME} "llvm-g++")
@@ -584,7 +584,7 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
  else()
    set(${CNAME} "_")
  endif()
-  
+
  # extract possible version numbers
  # first try to extract 3 isolated numbers:
  string(REGEX MATCH " [0-9]+\\.[0-9]+\\.[0-9]+" eicver ${VERSTRING})
@@ -602,9 +602,9 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
      endif()
    endif()
  endif()
-  
+
  string(REGEX REPLACE ".(.*)" "\\1" ${CVER} ${eicver})
-  
+
 endmacro(ei_get_compilerver_from_cxx_version_string)

 macro(ei_get_cxxflags VAR)
@@ -633,30 +633,30 @@ macro(ei_get_cxxflags VAR)
  elseif(EIGEN_TEST_SSE3)
    set(${VAR} SSE3)
  elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV)
-    set(${VAR} SSE2)  
+    set(${VAR} SSE2)
  endif()

  if(EIGEN_TEST_OPENMP)
    if (${VAR} STREQUAL "")
-	  set(${VAR} OMP)
-	else()
-	  set(${VAR} ${${VAR}}-OMP)
-	endif()
+      set(${VAR} OMP)
+    else()
+      set(${VAR} ${${VAR}}-OMP)
+    endif()
  endif()
-  
+
  if(EIGEN_DEFAULT_TO_ROW_MAJOR)
    if (${VAR} STREQUAL "")
-	  set(${VAR} ROW)
-	else()
-	  set(${VAR} ${${VAR}}-ROWMAJ)
-	endif()  
+      set(${VAR} ROW)
+    else()
+      set(${VAR} ${${VAR}}-ROWMAJ)
+    endif()
  endif()
 endmacro(ei_get_cxxflags)

 macro(ei_set_build_string)
  ei_get_compilerver(LOCAL_COMPILER_VERSION)
  ei_get_cxxflags(LOCAL_COMPILER_FLAGS)
-  
+
  include(EigenDetermineOSVersion)
  DetermineOSVersion(OS_VERSION)

@@ -672,11 +672,11 @@ macro(ei_set_build_string)
  else()
    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-64bit)
  endif()
-  
+
  if(EIGEN_TEST_CXX11)
    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11)
  endif()
-  
+
  if(EIGEN_BUILD_STRING_SUFFIX)
    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX})
  endif()
--- a/cmake/FindComputeCpp.cmake
+++ b/cmake/FindComputeCpp.cmake
@@ -1,6 +1,21 @@
 #.rst:
 # FindComputeCpp
 #---------------
+#
+#   Copyright 2016 Codeplay Software Ltd.
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use these files except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.

 #########################
 #  FindComputeCpp.cmake
@@ -8,6 +23,11 @@
 #
 #  Tools for finding and building with ComputeCpp.
 #
+#  User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp
+#   installation.
+#
+#  Latest version of this file can be found at:
+#    https://github.com/codeplaysoftware/computecpp-sdk

 # Require CMake version 3.2.2 or higher
 cmake_minimum_required(VERSION 3.2.2)
@@ -32,7 +52,6 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
      message(FATAL_ERROR
        "host compiler - Not found! (clang version must be at least 3.6)")
    else()
-			set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
      message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
    endif()
 else()
@@ -48,11 +67,12 @@ mark_as_advanced(COMPUTECPP_64_BIT_CODE)
 # Find OpenCL package
 find_package(OpenCL REQUIRED)

-# Find ComputeCpp package
-if(EXISTS ${COMPUTECPP_PACKAGE_ROOT_DIR})
-  message(STATUS "ComputeCpp package - Found (${COMPUTECPP_PACKAGE_ROOT_DIR})")
+# Find ComputeCpp packagee
+if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
+  message(FATAL_ERROR
+    "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR")
 else()
-  message(FATAL_ERROR "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR) (${COMPUTECPP_PACKAGE_ROOT_DIR})")
+  message(STATUS "ComputeCpp package - Found")
 endif()
 option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package")

@@ -61,9 +81,9 @@ find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS
  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
 if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
  mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
-  message(STATUS "compute++ - Found (${COMPUTECPP_PACKAGE_ROOT_DIR})")
+  message(STATUS "compute++ - Found")
 else()
-  message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER}) (${COMPUTECPP_PACKAGE_ROOT_DIR})")
+  message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
 endif()

 # Obtain the path to computecpp_info
@@ -71,9 +91,9 @@ find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
 if (EXISTS ${COMPUTECPP_INFO_TOOL})
  mark_as_advanced(${COMPUTECPP_INFO_TOOL})
-  message(STATUS "computecpp_info - Found (${COMPUTECPP_PACKAGE_ROOT_DIR})")
+  message(STATUS "computecpp_info - Found")
 else()
-  message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL}) (${COMPUTECPP_PACKAGE_ROOT_DIR})")
+  message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
 endif()

 # Obtain the path to the ComputeCpp runtime library
@@ -85,15 +105,15 @@ if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
  mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
  message(STATUS "libComputeCpp.so - Found")
 else()
-  message(FATAL_ERROR "libComputeCpp.so - Not found! (${COMPUTECPP_PACKAGE_ROOT_DIR})")
+  message(FATAL_ERROR "libComputeCpp.so - Not found!")
 endif()

 # Obtain the ComputeCpp include directory
 set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
 if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
-  message(FATAL_ERROR "ComputeCpp includes - Not found! (${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)")
+  message(FATAL_ERROR "ComputeCpp includes - Not found!")
 else()
-  message(STATUS "ComputeCpp includes - Found (${COMPUTECPP_PACKAGE_ROOT_DIR})")
+  message(STATUS "ComputeCpp includes - Found")
 endif()

 # Obtain the package version
@@ -144,7 +164,7 @@ endif()
 #
 #  targetName : Name of the target.
 #  sourceFile : Source file to be compiled.
-#  binaryDir  : Intermediate output directory for the integration header.
+#  binaryDir : Intermediate directory to output the integration header.
 #
 function(__build_spir targetName sourceFile binaryDir)

@@ -176,12 +196,13 @@ function(__build_spir targetName sourceFile binaryDir)
    OUTPUT ${outputSyclFile}
    COMMAND ${COMPUTECPP_DEVICE_COMPILER}
            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
-            -I${COMPUTECPP_INCLUDE_DIRECTORY}
+            -isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
            ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
            ${device_compiler_includes}
            -o ${outputSyclFile}
            -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}
    DEPENDS ${sourceFile}
+    WORKING_DIRECTORY ${binaryDir}
  COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")

  # Add a custom target for the generated integration header
@@ -190,10 +211,6 @@ function(__build_spir targetName sourceFile binaryDir)
  # Add a dependency on the integration header
  add_dependencies(${targetName} ${targetName}_integration_header)

-  # Force inclusion of the integration header for the host compiler
-  #set(compileFlags -include ${include_file} "-Wall")
-  target_compile_options(${targetName} PUBLIC ${compileFlags})
-
  # Set the host compiler C++ standard to C++11
  set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)

@@ -210,11 +227,11 @@ endfunction()
 #######################
 #
 #  Adds a SYCL compilation custom command associated with an existing
-#  target and sets a dependency on that new command.
+#  target and sets a dependancy on that new command.
 #
 #  targetName : Name of the target to add a SYCL to.
 #  sourceFile : Source file to be compiled for SYCL.
-#  binaryDir  : Intermediate output directory for the integration header.
+#  binaryDir : Intermediate directory to output the integration header.
 #
 function(add_sycl_to_target targetName sourceFile binaryDir)

--- a/doc/AsciiQuickReference.txt
+++ b/doc/AsciiQuickReference.txt
@@ -36,7 +36,7 @@ A.fill(10);       // Fill A with all 10's.
 MatrixXd::Identity(rows,cols)               // eye(rows,cols)
 C.setIdentity(rows,cols)                    // C = eye(rows,cols)
 MatrixXd::Zero(rows,cols)                   // zeros(rows,cols)
-C.setZero(rows,cols)                        // C = ones(rows,cols)
+C.setZero(rows,cols)                        // C = zeros(rows,cols)
 MatrixXd::Ones(rows,cols)                   // ones(rows,cols)
 C.setOnes(rows,cols)                        // C = ones(rows,cols)
 MatrixXd::Random(rows,cols)                 // rand(rows,cols)*2-1            // MatrixXd::Random returns uniform random numbers in (-1, 1).
@@ -84,7 +84,7 @@ P.bottomRightCorner<rows,cols>()   // P(end-rows+1:end, end-cols+1:end)

 // Of particular note is Eigen's swap function which is highly optimized.
 // Eigen                           // Matlab
-R.row(i) = P.col(j);               // R(i, :) = P(:, i)
+R.row(i) = P.col(j);               // R(i, :) = P(:, j)
 R.col(j1).swap(mat1.col(j2));      // R(:, [j1 j2]) = R(:, [j2, j1])

 // Views, transpose, etc;
--- a/doc/CoeffwiseMathFunctionsTable.dox
+++ b/doc/CoeffwiseMathFunctionsTable.dox
@@ -366,7 +366,7 @@ This also means that, unless specified, if the function \c std::foo is available
 <tr>
  <td class="code">
  \anchor cwisetable_isfinite
-  a.\link ArrayBase::isfinite isfinite\endlink(); \n
+  a.\link ArrayBase::isFinite isFinite\endlink(); \n
  \link Eigen::isfinite isfinite\endlink(a);
  </td>
  <td>checks if the given number has finite value</td>
@@ -377,7 +377,7 @@ This also means that, unless specified, if the function \c std::foo is available
 <tr>
  <td class="code">
  \anchor cwisetable_isinf
-  a.\link ArrayBase::isinf isinf\endlink(); \n
+  a.\link ArrayBase::isInf isInf\endlink(); \n
  \link Eigen::isinf isinf\endlink(a);
  </td>
  <td>checks if the given number is infinite</td>
@@ -388,7 +388,7 @@ This also means that, unless specified, if the function \c std::foo is available
 <tr>
  <td class="code">
  \anchor cwisetable_isnan
-  a.\link ArrayBase::isnan isnan\endlink(); \n
+  a.\link ArrayBase::isNaN isNaN\endlink(); \n
  \link Eigen::isnan isnan\endlink(a);
  </td>
  <td>checks if the given number is not a number</td>
@@ -399,7 +399,7 @@ This also means that, unless specified, if the function \c std::foo is available
 <tr>
 <th colspan="4">Error and gamma functions</th>
 </tr>
-<tr> <td colspan="4">  Require \c #include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr> <td colspan="4">  Require \c \#include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
 <tr>
  <td class="code">
  \anchor cwisetable_erf
@@ -478,7 +478,7 @@ This also means that, unless specified, if the function \c std::foo is available
 <tr>
 <th colspan="4">Special functions</th>
 </tr>
-<tr> <td colspan="4">  Require \c #include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr> <td colspan="4">  Require \c \#include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
 <tr>
  <td class="code">
  \anchor cwisetable_polygamma
@@ -522,4 +522,4 @@ This also means that, unless specified, if the function \c std::foo is available

 */

-}
+}
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -727,7 +727,8 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.

-EXCLUDE                = "${Eigen_SOURCE_DIR}/Eigen/Eigen2Support" \
+EXCLUDE                = "${Eigen_SOURCE_DIR}/Eigen/src/Core/products" \
+                         "${Eigen_SOURCE_DIR}/Eigen/Eigen2Support" \
                         "${Eigen_SOURCE_DIR}/Eigen/src/Eigen2Support" \
                         "${Eigen_SOURCE_DIR}/doc/examples" \
                         "${Eigen_SOURCE_DIR}/doc/special_examples" \
@@ -1591,9 +1592,13 @@ PREDEFINED             = EIGEN_EMPTY_STRUCT \
                         EIGEN_STRONG_INLINE=inline \
                         EIGEN_DEVICE_FUNC= \
                         "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template<typename OtherDerived> const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const;" \
-                         "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp<internal::scalar_product_op<typename LHS::Scalar, typename RHS::Scalar >, const LHS, const RHS>"\
+                         "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp<internal::scalar_product_op<LHS::Scalar,RHS::Scalar>, const LHS, const RHS>"\
+                         "EIGEN_CAT2(a,b)= a ## b"\
+                         "EIGEN_CAT(a,b)=EIGEN_CAT2(a,b)"\
+                         "EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME)=CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<LHS::Scalar, RHS::Scalar>, const LHS, const RHS>"\
                         DOXCOMMA=,

+
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
 # this tag can be used to specify a list of macro names that should be expanded.
 # The macro definition that is found in the sources will be used.
@@ -1617,6 +1622,7 @@ EXPAND_AS_DEFINED      = EIGEN_MAKE_TYPEDEFS \
                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF

+
 # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
 # doxygen's preprocessor will remove all references to function-like macros
 # that are alone on a line, have an all uppercase name, and do not end with a
--- a/doc/Manual.dox
+++ b/doc/Manual.dox
@@ -26,6 +26,7 @@ namespace Eigen {
  - \subpage TopicPitfalls
  - \subpage TopicTemplateKeyword
  - \subpage UserManual_UnderstandingEigen
+  - \subpage TopicCMakeGuide
 */

 /** \page UserManual_UnderstandingEigen Understanding Eigen
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -97,35 +97,39 @@ run time. However, these assertions do cost time and can thus be turned off.

 \section TopicPreprocessorDirectivesPerformance Alignment, vectorization and performance tweaking

- - \b EIGEN_MALLOC_ALREADY_ALIGNED - Can be set to 0 or 1 to tell whether default system \c malloc already
+ - \b \c EIGEN_MALLOC_ALREADY_ALIGNED - Can be set to 0 or 1 to tell whether default system \c malloc already
   returns aligned buffers. In not defined, then this information is automatically deduced from the compiler
   and system preprocessor tokens.
- - \b EIGEN_DONT_ALIGN - disables alignment completely. %Eigen will not try to align its objects and does not
-   expect that any objects passed to it are aligned. This will turn off vectorization. Not defined by default.
- - \b EIGEN_DONT_ALIGN_STATICALLY - disables alignment of arrays on the stack. Not defined by default, unless
-   \c EIGEN_DONT_ALIGN is defined.
- - \b EIGEN_DONT_PARALLELIZE - if defined, this disables multi-threading. This is only relevant if you enabled OpenMP.
+ - \b \c EIGEN_MAX_ALIGN_BYTES - Must be a power of two, or 0. Defines an upper bound on the memory boundary in bytes on which dynamically and statically allocated data may be aligned by %Eigen. If not defined, a default value is automatically computed based on architecture, compiler, and OS.
+ This option is typically used to enforce binary compatibility between code/libraries compiled with different SIMD options. For instance, one may compile AVX code and enforce ABI compatibility with existing SSE code by defining \c EIGEN_MAX_ALIGN_BYTES=16. In the other way round, since by default AVX implies 32 bytes alignment for best performance, one can compile SSE code to be ABI compatible with AVX code by defining \c EIGEN_MAX_ALIGN_BYTES=32.
+ - \b \c EIGEN_MAX_STATIC_ALIGN_BYTES - Same as \c EIGEN_MAX_ALIGN_BYTES but for statically allocated data only. By default, if only  \c EIGEN_MAX_ALIGN_BYTES is defined, then \c EIGEN_MAX_STATIC_ALIGN_BYTES == \c EIGEN_MAX_ALIGN_BYTES, otherwise a default value is automatically computed based on architecture, compiler, and OS (can be smaller than the default value of EIGEN_MAX_ALIGN_BYTES on architectures that do not support stack alignment).
+ Let us emphasize that \c EIGEN_MAX_*_ALIGN_BYTES define only a diserable upper bound. In practice data is aligned to largest power-of-two common divisor of \c EIGEN_MAX_STATIC_ALIGN_BYTES and the size of the data, such that memory is not wasted.
+ - \b \c EIGEN_DONT_PARALLELIZE - if defined, this disables multi-threading. This is only relevant if you enabled OpenMP.
   See \ref TopicMultiThreading for details.
 - \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
   alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
- - \b EIGEN_UNALIGNED_VECTORIZE - disables/enables vectorization with unaligned stores. Default is 1 (enabled).
+ - \b \c EIGEN_UNALIGNED_VECTORIZE - disables/enables vectorization with unaligned stores. Default is 1 (enabled).
   If set to 0 (disabled), then expression for which the destination cannot be aligned are not vectorized (e.g., unaligned
   small fixed size vectors or matrices)
- - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
+ - \b \c EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
   enables the SSE vectorization of sin() and cos(), and speedups sqrt() for single precision. Defined to 1 by default.
   Define it to 0 to disable.
- - \b EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
+ - \b \c EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
   unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not
   correspond to the number of iterations or the number of instructions. The default is value 100.
- - \b EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
+ - \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
   temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
   this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.


+ - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
+ - \c EIGEN_DONT_ALIGN_STATICALLY - Deprecated, it is a synonym for \c EIGEN_MAX_STATIC_ALIGN_BYTES=0. It disables alignment of arrays on the stack. Not defined by default, unless \c EIGEN_DONT_ALIGN is defined.
+
+
 \section TopicPreprocessorDirectivesPlugins Plugins

 It is possible to add new methods to many fundamental classes in %Eigen by writing a plugin. As explained in
-the section \ref ExtendingMatrixBase, the plugin is specified by defining a \c EIGEN_xxx_PLUGIN macro. The
+the section \ref TopicCustomizing_Plugins, the plugin is specified by defining a \c EIGEN_xxx_PLUGIN macro. The
 following macros are supported; none of them are defined by default.

 - \b EIGEN_ARRAY_PLUGIN - filename of plugin for extending the Array class.
--- a/doc/QuickReference.dox
+++ b/doc/QuickReference.dox
@@ -340,7 +340,7 @@ mat1 = mat2.adjoint();        mat1.adjointInPlace();
 \endcode
 </td></tr>
 <tr><td>
-\link MatrixBase::dot() dot \endlink product \n inner product \matrixworld</td><td>\code
+\link MatrixBase::dot dot \endlink product \n inner product \matrixworld</td><td>\code
 scalar = vec1.dot(vec2);
 scalar = col1.adjoint() * col2;
 scalar = (col1.adjoint() * col2).value();\endcode
--- a/doc/TopicCMakeGuide.dox
+++ b/doc/TopicCMakeGuide.dox
@@ -0,0 +1,52 @@
+namespace Eigen {
+
+/**
+
+\page TopicCMakeGuide Using %Eigen in CMake Projects
+
+%Eigen provides native CMake support which allows the library to be easily
+used in CMake projects.
+
+\note %CMake 3.0 (or later) is required to enable this functionality.
+
+%Eigen exports a CMake target called `Eigen3::Eigen` which can be imported
+using the `find_package` CMake command and used by calling
+`target_link_libraries` as in the following example:
+\code{.cmake}
+cmake_minimum_required (VERSION 3.0)
+project (myproject)
+
+find_package (Eigen3 3.3 REQUIRED NO_MODULE)
+
+add_executable (example example.cpp)
+target_link_libraries (example Eigen3::Eigen)
+\endcode
+
+The above code snippet must be placed in a file called `CMakeLists.txt` alongside
+`example.cpp`. After running
+\code{.sh}
+$ cmake path-to-example-directory
+\endcode
+CMake will produce project files that generate an executable called `example`
+which requires at least version 3.3 of %Eigen. Here, `path-to-example-directory`
+is the path to the directory that contains both `CMakeLists.txt` and
+`example.cpp`.
+
+If you have multiple installed version of %Eigen, you can pick your favorite one by setting the \c Eigen3_DIR cmake's variable to the respective path containing the \c Eigen3*.cmake files. For instance:
+\code
+cmake path-to-example-directory -DEigen3_DIR=$HOME/mypackages/share/eigen3/cmake/
+\endcode
+
+If the `REQUIRED` option is omitted when locating %Eigen using
+`find_package`, one can check whether the package was found as follows:
+\code{.cmake}
+find_package (Eigen3 3.3 NO_MODULE)
+
+if (TARGET Eigen3::Eigen)
+  # Use the imported target
+endif (TARGET Eigen3::Eigen)
+\endcode
+
+*/
+
+}
--- a/doc/snippets/Cwise_boolean_xor.cpp
+++ b/doc/snippets/Cwise_boolean_xor.cpp
@@ -0,0 +1,2 @@
+Array3d v(-1,2,1), w(-3,2,3);
+cout << ((v<w) ^ (v<0)) << endl;
--- a/doc/snippets/MatrixBase_selfadjointView.cpp
+++ b/doc/snippets/MatrixBase_selfadjointView.cpp
@@ -0,0 +1,6 @@
+Matrix3i m = Matrix3i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is the symmetric matrix extracted from the upper part of m:" << endl
+     << Matrix3i(m.selfadjointView<Upper>()) << endl;
+cout << "Here is the symmetric matrix extracted from the lower part of m:" << endl
+     << Matrix3i(m.selfadjointView<Lower>()) << endl;
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -134,6 +134,12 @@ template<typename MatrixType> void comparisons(const MatrixType& m)
  // count
  VERIFY(((m1.array().abs()+1)>RealScalar(0.1)).count() == rows*cols);

+  // and/or
+  VERIFY( ((m1.array()<RealScalar(0)).matrix() && (m1.array()>RealScalar(0)).matrix()).count() == 0);
+  VERIFY( ((m1.array()<RealScalar(0)).matrix() || (m1.array()>=RealScalar(0)).matrix()).count() == rows*cols);
+  RealScalar a = m1.cwiseAbs().mean();
+  VERIFY( ((m1.array()<-a).matrix() || (m1.array()>a).matrix()).count() == (m1.cwiseAbs().array()>a).count());
+
  typedef Matrix<typename MatrixType::Index, Dynamic, 1> VectorOfIndices;

  // TODO allows colwise/rowwise for array
--- a/test/geo_hyperplane.cpp
+++ b/test/geo_hyperplane.cpp
@@ -66,12 +66,15 @@ template<typename HyperplaneType> void hyperplane(const HyperplaneType& _plane)
    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot,Isometry).absDistance(rot * p1), Scalar(1) );
    pl2 = pl1;
    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*scaling).absDistance((rot*scaling) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
    pl2 = pl1;
    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*scaling*translation)
                                  .absDistance((rot*scaling*translation) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
    pl2 = pl1;
    VERIFY_IS_MUCH_SMALLER_THAN( pl2.transform(rot*translation,Isometry)
                                 .absDistance((rot*translation) * p1), Scalar(1) );
+    VERIFY_IS_APPROX( pl2.normal().norm(), RealScalar(1) );
  }

  // casting
--- a/test/main.h
+++ b/test/main.h
@@ -90,7 +90,7 @@ inline void on_temporary_creation(long int size) {
 #define VERIFY_EVALUATION_COUNT(XPR,N) {\
    nb_temporaries = 0; \
    XPR; \
-    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
+    if(nb_temporaries!=N) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
    VERIFY( (#XPR) && nb_temporaries==N ); \
  }
  
@@ -372,10 +372,10 @@ inline bool test_isApproxOrLessThan(const half& a, const half& b)

 // test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox.
 template<typename T1,typename T2>
-typename T1::RealScalar test_relative_error(const EigenBase<T1> &a, const EigenBase<T2> &b)
+typename NumTraits<typename T1::RealScalar>::NonInteger test_relative_error(const EigenBase<T1> &a, const EigenBase<T2> &b)
 {
  using std::sqrt;
-  typedef typename T1::RealScalar RealScalar;
+  typedef typename NumTraits<typename T1::RealScalar>::NonInteger RealScalar;
  typename internal::nested_eval<T1,2>::type ea(a.derived());
  typename internal::nested_eval<T2,2>::type eb(b.derived());
  return sqrt(RealScalar((ea-eb).cwiseAbs2().sum()) / RealScalar((std::min)(eb.cwiseAbs2().sum(),ea.cwiseAbs2().sum())));
@@ -433,9 +433,9 @@ typename T1::RealScalar test_relative_error(const SparseMatrixBase<T1> &a, const
 }

 template<typename T1,typename T2>
-typename NumTraits<T1>::Real test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T1>::Real>::value, T1>::type* = 0)
+typename NumTraits<typename NumTraits<T1>::Real>::NonInteger test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T1>::Real>::value, T1>::type* = 0)
 {
-  typedef typename NumTraits<T1>::Real RealScalar; 
+  typedef typename NumTraits<typename NumTraits<T1>::Real>::NonInteger RealScalar;
  return numext::sqrt(RealScalar(numext::abs2(a-b))/RealScalar((numext::mini)(numext::abs2(a),numext::abs2(b))));
 }

--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -591,7 +591,7 @@ template<typename Scalar> void packetmath_scatter_gather()
  int stride = internal::random<int>(1,20);

  EIGEN_ALIGN_MAX Scalar buffer[PacketSize*20];
-  memset(buffer, 0, 20*sizeof(Packet));
+  memset(buffer, 0, 20*PacketSize*sizeof(Scalar));
  Packet packet = internal::pload<Packet>(data1);
  internal::pscatter<Scalar, Packet>(buffer, packet, stride);

--- a/test/permutationmatrices.cpp
+++ b/test/permutationmatrices.cpp
@@ -108,7 +108,14 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
    rm = rp;
    rm.col(i).swap(rm.col(j));
    VERIFY_IS_APPROX(rm, rp2.toDenseMatrix().template cast<Scalar>());
-  }  
+  }
+
+  {
+    // simple compilation check
+    Matrix<Scalar, Cols, Cols> A = rp;
+    Matrix<Scalar, Cols, Cols> B = rp.transpose();
+    VERIFY_IS_APPROX(A, B.transpose());
+  }
 }

 template<typename T>
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -98,6 +98,16 @@ template<typename MatrixType> void product_extra(const MatrixType& m)
  // regression test
  MatrixType tmp = m1 * m1.adjoint() * s1;
  VERIFY_IS_APPROX(tmp, m1 * m1.adjoint() * s1);
+
+  // regression test for bug 1343, assignment to arrays
+  Array<Scalar,Dynamic,1> a1 = m1 * vc2;
+  VERIFY_IS_APPROX(a1.matrix(),m1*vc2);
+  Array<Scalar,Dynamic,1> a2 = s1 * (m1 * vc2);
+  VERIFY_IS_APPROX(a2.matrix(),s1*m1*vc2);
+  Array<Scalar,1,Dynamic> a3 = v1 * m1;
+  VERIFY_IS_APPROX(a3.matrix(),v1*m1);
+  Array<Scalar,Dynamic,Dynamic> a4 = m1 * m2.adjoint();
+  VERIFY_IS_APPROX(a4.matrix(),m1*m2.adjoint());
 }

 // Regression test for bug reported at http://forum.kde.org/viewtopic.php?f=74&t=96947
--- a/test/product_mmtr.cpp
+++ b/test/product_mmtr.cpp
@@ -62,6 +62,19 @@ template<typename Scalar> void mmtr(int size)
  CHECK_MMTR(matc, Upper, -= (s*sqc).template triangularView<Upper>()*sqc);
  CHECK_MMTR(matc, Lower, = (s*sqr).template triangularView<Lower>()*sqc);
  CHECK_MMTR(matc, Upper, += (s*sqc).template triangularView<Lower>()*sqc);
+
+  // check aliasing
+  ref2 = ref1 = matc;
+  ref1 = sqc.adjoint() * matc * sqc;
+  ref2.template triangularView<Upper>() = ref1.template triangularView<Upper>();
+  matc.template triangularView<Upper>() = sqc.adjoint() * matc * sqc;
+  VERIFY_IS_APPROX(matc, ref2);
+
+  ref2 = ref1 = matc;
+  ref1 = sqc * matc * sqc.adjoint();
+  ref2.template triangularView<Lower>() = ref1.template triangularView<Lower>();
+  matc.template triangularView<Lower>() = sqc * matc * sqc.adjoint();
+  VERIFY_IS_APPROX(matc, ref2);
 }

 void test_product_mmtr()
--- a/test/product_notemporary.cpp
+++ b/test/product_notemporary.cpp
@@ -136,6 +136,10 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
  VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
  VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );
  VERIFY_EVALUATION_COUNT( rm3.noalias() += (m1*cv1) * (rv1 * m1), 2 );
+
+  // Check nested products
+  VERIFY_EVALUATION_COUNT( cvres.noalias() = m1.adjoint() * m1 * cv1, 1 );
+  VERIFY_EVALUATION_COUNT( rvres.noalias() = rv1 * (m1 * m2.adjoint()), 1 );
 }

 void test_product_notemporary()
--- a/test/product_symm.cpp
+++ b/test/product_symm.cpp
@@ -39,6 +39,24 @@ template<typename Scalar, int Size, int OtherSize> void symm(int size = Size, in
  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>() * (s2*rhs1),
                   rhs13 = (s1*m1) * (s2*rhs1));

+  VERIFY_IS_APPROX(rhs12 = (s1*m2).transpose().template selfadjointView<Upper>() * (s2*rhs1),
+                   rhs13 = (s1*m1.transpose()) * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().transpose() * (s2*rhs1),
+                   rhs13 = (s1*m1.transpose()) * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).conjugate().template selfadjointView<Lower>() * (s2*rhs1),
+                   rhs13 = (s1*m1).conjugate() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().conjugate() * (s2*rhs1),
+                   rhs13 = (s1*m1).conjugate() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).adjoint().template selfadjointView<Upper>() * (s2*rhs1),
+                   rhs13 = (s1*m1).adjoint() * (s2*rhs1));
+
+  VERIFY_IS_APPROX(rhs12 = (s1*m2).template selfadjointView<Lower>().adjoint() * (s2*rhs1),
+                   rhs13 = (s1*m1).adjoint() * (s2*rhs1));
+
  m2 = m1.template triangularView<Upper>(); rhs12.setRandom(); rhs13 = rhs12;
  m3 = m2.template selfadjointView<Upper>();
  VERIFY_IS_EQUAL(m1, m3);
--- a/test/selfadjoint.cpp
+++ b/test/selfadjoint.cpp
@@ -21,7 +21,9 @@ template<typename MatrixType> void selfadjoint(const MatrixType& m)
  Index cols = m.cols();

  MatrixType m1 = MatrixType::Random(rows, cols),
-             m3(rows, cols);
+             m2 = MatrixType::Random(rows, cols),
+             m3(rows, cols),
+             m4(rows, cols);

  m1.diagonal() = m1.diagonal().real().template cast<Scalar>();

@@ -30,10 +32,19 @@ template<typename MatrixType> void selfadjoint(const MatrixType& m)
  VERIFY_IS_APPROX(MatrixType(m3.template triangularView<Upper>()), MatrixType(m1.template triangularView<Upper>()));
  VERIFY_IS_APPROX(m3, m3.adjoint());

-
  m3 = m1.template selfadjointView<Lower>();
  VERIFY_IS_APPROX(MatrixType(m3.template triangularView<Lower>()), MatrixType(m1.template triangularView<Lower>()));
  VERIFY_IS_APPROX(m3, m3.adjoint());
+
+  m3 = m1.template selfadjointView<Upper>();
+  m4 = m2;
+  m4 += m1.template selfadjointView<Upper>();
+  VERIFY_IS_APPROX(m4, m2+m3);
+
+  m3 = m1.template selfadjointView<Lower>();
+  m4 = m2;
+  m4 -= m1.template selfadjointView<Lower>();
+  VERIFY_IS_APPROX(m4, m2-m3);
 }

 void bug_159()
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -25,6 +25,7 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
  //const Index outer = ref.outerSize();

  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::RealScalar RealScalar;
  enum { Flags = SparseMatrixType::Flags };

  double density = (std::max)(8./(rows*cols), 0.01);
@@ -193,6 +194,17 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
    VERIFY_IS_APPROX(m3 + refM4, refM3 + refM4);
    VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3);
    VERIFY_IS_APPROX(m3 - refM4, refM3 - refM4);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3*RealScalar(0.5)).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3.cwiseProduct(m3)).eval(), RealScalar(0.5)*refM4 + refM3.cwiseProduct(refM3));
+
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + m3*RealScalar(0.5)).eval(), RealScalar(0.5)*refM4 + RealScalar(0.5)*refM3);
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (m3+m3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+    VERIFY_IS_APPROX(((refM3+m3)+RealScalar(0.5)*m3).eval(), RealScalar(0.5)*refM3 + (refM3+refM3));
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (refM3+m3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+    VERIFY_IS_APPROX((RealScalar(0.5)*refM4 + (m3+refM3)).eval(), RealScalar(0.5)*refM4 + (refM3+refM3));
+

    VERIFY_IS_APPROX(m1.sum(), refM1.sum());

@@ -217,6 +229,51 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
          refM1(it.row(), it.col()) += s1;
      VERIFY_IS_APPROX(m1, refM1);
    }
+
+    // and/or
+    {
+      typedef SparseMatrix<bool, SparseMatrixType::Options, typename SparseMatrixType::StorageIndex> SpBool;
+      SpBool mb1 = m1.real().template cast<bool>();
+      SpBool mb2 = m2.real().template cast<bool>();
+      VERIFY_IS_EQUAL(mb1.template cast<int>().sum(), refM1.real().template cast<bool>().count());
+      VERIFY_IS_EQUAL((mb1 && mb2).template cast<int>().sum(), (refM1.real().template cast<bool>() && refM2.real().template cast<bool>()).count());
+      VERIFY_IS_EQUAL((mb1 || mb2).template cast<int>().sum(), (refM1.real().template cast<bool>() || refM2.real().template cast<bool>()).count());
+      SpBool mb3 = mb1 && mb2;
+      if(mb1.coeffs().all() && mb2.coeffs().all())
+      {
+        VERIFY_IS_EQUAL(mb3.nonZeros(), (refM1.real().template cast<bool>() && refM2.real().template cast<bool>()).count());
+      }
+    }
+  }
+
+  // test reverse iterators
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    std::vector<Scalar> ref_value(m2.innerSize());
+    std::vector<Index> ref_index(m2.innerSize());
+    if(internal::random<bool>())
+      m2.makeCompressed();
+    for(Index j = 0; j<m2.outerSize(); ++j)
+    {
+      Index count_forward = 0;
+
+      for(typename SparseMatrixType::InnerIterator it(m2,j); it; ++it)
+      {
+        ref_value[ref_value.size()-1-count_forward] = it.value();
+        ref_index[ref_index.size()-1-count_forward] = it.index();
+        count_forward++;
+      }
+      Index count_reverse = 0;
+      for(typename SparseMatrixType::ReverseInnerIterator it(m2,j); it; --it)
+      {
+        VERIFY_IS_APPROX( std::abs(ref_value[ref_value.size()-count_forward+count_reverse])+1, std::abs(it.value())+1);
+        VERIFY_IS_EQUAL( ref_index[ref_index.size()-count_forward+count_reverse] , it.index());
+        count_reverse++;
+      }
+      VERIFY_IS_EQUAL(count_forward, count_reverse);
+    }
  }

  // test transpose
@@ -386,6 +443,14 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
    m3 = m2.template selfadjointView<Lower>();
    VERIFY_IS_APPROX(m3, refMat3);

+    refMat3 += refMat2.template selfadjointView<Lower>();
+    m3 += m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    refMat3 -= refMat2.template selfadjointView<Lower>();
+    m3 -= m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
    // selfadjointView only works for square matrices:
    SparseMatrixType m4(rows, rows+1);
    VERIFY_RAISES_ASSERT(m4.template selfadjointView<Lower>());
--- a/test/sparse_block.cpp
+++ b/test/sparse_block.cpp
@@ -9,6 +9,20 @@

 #include "sparse.h"

+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==RowMajorBit, typename T::RowXpr>::type
+innervec(T& A, Index i)
+{
+  return A.row(i);
+}
+
+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==0, typename T::ColXpr>::type
+innervec(T& A, Index i)
+{
+  return A.col(i);
+}
+
 template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
 {
  const Index rows = ref.rows();
@@ -20,9 +34,10 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
  typedef typename SparseMatrixType::StorageIndex StorageIndex;

  double density = (std::max)(8./(rows*cols), 0.01);
-  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,Dynamic,SparseMatrixType::IsRowMajor?RowMajor:ColMajor> DenseMatrix;
  typedef Matrix<Scalar,Dynamic,1> DenseVector;
  typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
+  typedef SparseVector<Scalar> SparseVectorType;

  Scalar s1 = internal::random<Scalar>();
  {
@@ -110,15 +125,35 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
    initSparse<Scalar>(density, refMat2, m2);
    Index j0 = internal::random<Index>(0,outer-1);
    Index j1 = internal::random<Index>(0,outer-1);
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0));
-    else
-      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0));
+    Index r0 = internal::random<Index>(0,rows-1);
+    Index c0 = internal::random<Index>(0,cols-1);

-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1));
-    else
-      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1));
+    VERIFY_IS_APPROX(m2.innerVector(j0), innervec(refMat2,j0));
+    VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), innervec(refMat2,j0)+innervec(refMat2,j1));
+
+    m2.innerVector(j0) *= Scalar(2);
+    innervec(refMat2,j0) *= Scalar(2);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.row(r0) *= Scalar(3);
+    refMat2.row(r0) *= Scalar(3);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.col(c0) *= Scalar(4);
+    refMat2.col(c0) *= Scalar(4);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.row(r0) /= Scalar(3);
+    refMat2.row(r0) /= Scalar(3);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.col(c0) /= Scalar(4);
+    refMat2.col(c0) /= Scalar(4);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    SparseVectorType v1;
+    VERIFY_IS_APPROX(v1 = m2.col(c0) * 4, refMat2.col(c0)*4);
+    VERIFY_IS_APPROX(v1 = m2.row(r0) * 4, refMat2.row(r0).transpose()*4);

    SparseMatrixType m3(rows,cols);
    m3.reserve(VectorXi::Constant(outer,int(inner/2)));
@@ -223,6 +258,33 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re

    VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1));
    VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1));
+
+    if(m2.nonZeros()>0)
+    {
+      VERIFY_IS_APPROX(m2, refMat2);
+      SparseMatrixType m3(rows, cols);
+      DenseMatrix refMat3(rows, cols); refMat3.setZero();
+      Index n = internal::random<Index>(1,10);
+      for(Index k=0; k<n; ++k)
+      {
+        Index o1 = internal::random<Index>(0,outer-1);
+        Index o2 = internal::random<Index>(0,outer-1);
+        if(SparseMatrixType::IsRowMajor)
+        {
+          m3.innerVector(o1) = m2.row(o2);
+          refMat3.row(o1) = refMat2.row(o2);
+        }
+        else
+        {
+          m3.innerVector(o1) = m2.col(o2);
+          refMat3.col(o1) = refMat2.col(o2);
+        }
+        if(internal::random<bool>())
+          m3.makeCompressed();
+      }
+      if(m3.nonZeros()>0)
+      VERIFY_IS_APPROX(m3, refMat3);
+    }
  }
 }

--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -272,6 +272,7 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxS
  typedef typename Mat::Scalar Scalar;
  typedef typename Mat::StorageIndex StorageIndex;
  typedef SparseMatrix<Scalar,ColMajor, StorageIndex> SpMat;
+  typedef SparseVector<Scalar, 0, StorageIndex> SpVec;
  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
  typedef Matrix<Scalar,Dynamic,1> DenseVector;

@@ -288,6 +289,8 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxS
    DenseVector b = DenseVector::Random(size);
    DenseMatrix dB(size,rhsCols);
    initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
+    SpVec c = B.col(0);
+    DenseVector dc = dB.col(0);
  
    CALL_SUBTEST( check_sparse_solving(solver, A,     b,  dA, b)  );
    CALL_SUBTEST( check_sparse_solving(solver, halfA, b,  dA, b)  );
@@ -295,6 +298,8 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxS
    CALL_SUBTEST( check_sparse_solving(solver, halfA, dB, dA, dB) );
    CALL_SUBTEST( check_sparse_solving(solver, A,     B,  dA, dB) );
    CALL_SUBTEST( check_sparse_solving(solver, halfA, B,  dA, dB) );
+    CALL_SUBTEST( check_sparse_solving(solver, A,     c,  dA, dc) );
+    CALL_SUBTEST( check_sparse_solving(solver, halfA, c,  dA, dc) );
    
    // check only once
    if(i==0)
@@ -396,6 +401,7 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver, int m
  typedef typename Solver::MatrixType Mat;
  typedef typename Mat::Scalar Scalar;
  typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
+  typedef SparseVector<Scalar, 0, typename Mat::StorageIndex> SpVec;
  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
  typedef Matrix<Scalar,Dynamic,1> DenseVector;

@@ -413,9 +419,12 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver, int m
    double density = (std::max)(8./(size*rhsCols), 0.1);
    initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
    B.makeCompressed();
+    SpVec c = B.col(0);
+    DenseVector dc = dB.col(0);
    CALL_SUBTEST(check_sparse_solving(solver, A, b,  dA, b));
    CALL_SUBTEST(check_sparse_solving(solver, A, dB, dA, dB));
    CALL_SUBTEST(check_sparse_solving(solver, A, B,  dA, dB));
+    CALL_SUBTEST(check_sparse_solving(solver, A, c,  dA, dc));
    
    // check only once
    if(i==0)
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -13,6 +13,18 @@

 #include "../../../Eigen/Core"

+#ifdef EIGEN_USE_SYCL
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <SYCL/sycl.hpp>
+#include <map>
+#include <memory>
+#include <utility>
+#endif
+
 #include <Eigen/src/Core/util/DisableStupidWarnings.h>

 #include "../SpecialFunctions"
@@ -69,10 +81,6 @@ typedef unsigned __int64 uint64_t;
 #endif
 #endif

-#ifdef EIGEN_USE_SYCL
-#include <SYCL/sycl.hpp>
-#endif
-
 #include "src/Tensor/TensorMacros.h"
 #include "src/Tensor/TensorForwardDeclarations.h"
 #include "src/Tensor/TensorMeta.h"
@@ -81,7 +89,6 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorDeviceDefault.h"
 #include "src/Tensor/TensorDeviceThreadPool.h"
 #include "src/Tensor/TensorDeviceCuda.h"
-#include "src/Tensor/TensorSycl.h"
 #include "src/Tensor/TensorDeviceSycl.h"
 #include "src/Tensor/TensorIndexList.h"
 #include "src/Tensor/TensorDimensionList.h"
@@ -128,6 +135,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorAssign.h"
 #include "src/Tensor/TensorScan.h"

+#include "src/Tensor/TensorSycl.h"
 #include "src/Tensor/TensorExecutor.h"
 #include "src/Tensor/TensorDevice.h"

--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -1,12 +1,11 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
 // Mehdi Goli    Codeplay Software Ltd.
 // Ralph Potter  Codeplay Software Ltd.
 // Luke Iwanski  Codeplay Software Ltd.
-// Cummins Chris PhD student at The University of Edinburgh.
 // Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>

 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -17,106 +16,107 @@
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H

 namespace Eigen {
-/// \struct BufferT is used to specialise add_sycl_buffer function for
-//  two types of buffer we have. When the MapAllocator is true, we create the
-//  sycl buffer with MapAllocator.
-/// We have to const_cast the input pointer in order to work around the fact
-/// that sycl does not accept map allocator for const pointer.
-template <typename T, bool MapAllocator>
-struct BufferT {
-  using Type = cl::sycl::buffer<T, 1, cl::sycl::map_allocator<T>>;
-  static inline void add_sycl_buffer(
-      const T *ptr, size_t num_bytes,
-      std::map<const void *, std::shared_ptr<void>> &buffer_map) {
-    buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(
-        ptr, std::shared_ptr<void>(std::make_shared<Type>(
-                 Type(const_cast<T *>(ptr), cl::sycl::range<1>(num_bytes))))));
-  }
-};
-
-/// specialisation of the \ref BufferT when the MapAllocator is false. In this
-/// case we only create the device-only buffer.
-template <typename T>
-struct BufferT<T, false> {
-  using Type = cl::sycl::buffer<T, 1>;
-  static inline void add_sycl_buffer(
-      const T *ptr, size_t num_bytes,
-      std::map<const void *, std::shared_ptr<void>> &buffer_map) {
-    buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(
-        ptr, std::shared_ptr<void>(
-                 std::make_shared<Type>(Type(cl::sycl::range<1>(num_bytes))))));
-  }
-};
-
 struct SyclDevice {
  /// class members
  /// sycl queue
-  cl::sycl::queue &m_queue;
+  mutable cl::sycl::queue m_queue;
  /// std::map is the container used to make sure that we create only one buffer
-  /// per pointer. The lifespan of the buffer
-  /// now depends on the lifespan of SyclDevice. If a non-read-only pointer is
-  /// needed to be accessed on the host we should manually deallocate it.
+  /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
+  /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
  mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
-
-  SyclDevice(cl::sycl::queue &q) : m_queue(q) {}
+  /// creating device by using selector
+  template<typename dev_Selector> SyclDevice(dev_Selector s)
+  :
+#ifdef EIGEN_EXCEPTIONS
+  m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
+    for (const auto& e : l) {
+      try {
+        std::rethrow_exception(e);
+      } catch (cl::sycl::exception e) {
+          std::cout << e.what() << std::endl;
+        }
+    }
+  }))
+#else
+  m_queue(cl::sycl::queue(s))
+#endif
+  {}
  // destructor
  ~SyclDevice() { deallocate_all(); }

-  template <typename T>
-  void deallocate(const T *p) const {
+  template <typename T> void deallocate(T *p) const {
    auto it = buffer_map.find(p);
    if (it != buffer_map.end()) {
      buffer_map.erase(it);
+      internal::aligned_free(p);
    }
  }
-  void deallocate_all() const { buffer_map.clear(); }
+  void deallocate_all() const {
+    std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
+    while (it!=buffer_map.end()) {
+      auto p=it->first;
+      buffer_map.erase(it);
+      internal::aligned_free(const_cast<void*>(p));
+      it=buffer_map.begin();
+    }
+    buffer_map.clear();
+  }

  /// creation of sycl accessor for a buffer. This function first tries to find
-  /// the buffer in the buffer_map.
-  /// If found it gets the accessor from it, if not, the function then adds an
-  /// entry by creating a sycl buffer
-  /// for that particular pointer.
-  template <cl::sycl::access::mode AcMd, bool MapAllocator, typename T>
-  inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
-  get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh,
-                    const T *ptr) const {
-    auto it = buffer_map.find(ptr);
-    if (it == buffer_map.end()) {
-      BufferT<T, MapAllocator>::add_sycl_buffer(ptr, num_bytes, buffer_map);
-    }
-    return (
-        ((typename BufferT<T, MapAllocator>::Type *)(buffer_map.at(ptr).get()))
-            ->template get_access<AcMd>(cgh));
+  /// the buffer in the buffer_map. If found it gets the accessor from it, if not,
+  ///the function then adds an entry by creating a sycl buffer for that particular pointer.
+  template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
+    return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+  }
+
+  template<typename T> inline  std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
+    using Type = cl::sycl::buffer<T, 1>;
+    std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
+      [](void *dataMem) { delete static_cast<Type*>(dataMem); })));
+    (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
+    return ret;
+  }
+
+  template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
+    return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
  }

  /// allocating memory on the cpu
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
-    return internal::aligned_malloc(num_bytes);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
+    return internal::aligned_malloc(8);
  }

  // some runtime conditions that can be applied here
  bool isDeviceSuitable() const { return true; }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void *buffer) const {
-    internal::aligned_free(buffer);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src,
-                                                    size_t n) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    ::memcpy(dst, src, n);
  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(
-      void *dst, const void *src, size_t n) const {
-    memcpy(dst, src, n);
+
+  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
+    auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
+    memcpy(host_acc.get_pointer(), src, n);
  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(
-      void *dst, const void *src, size_t n) const {
-    memcpy(dst, src, n);
+ /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon.
+  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
+    auto it = buffer_map.find(src);
+    if (it != buffer_map.end()) {
+      auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
+      memcpy(dst,host_acc.get_pointer(),  n);
+    } else{
+      eigen_assert("no device memory found. The memory might be destroyed before creation");
+    }
  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c,
-                                                    size_t n) const {
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
    ::memset(buffer, c, n);
  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+  return 1;
+  }
 };
+
 }  // end namespace Eigen

 #endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -47,13 +47,13 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
 template<typename XprType, template <class> class MakePointer_>
 struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense>
 {
-  typedef const TensorEvalToOp<XprType>& type;
+  typedef const TensorEvalToOp<XprType, MakePointer_>& type;
 };

 template<typename XprType, template <class> class MakePointer_>
 struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type>
 {
-  typedef TensorEvalToOp<XprType> type;
+  typedef TensorEvalToOp<XprType, MakePointer_> type;
 };

 }  // end namespace internal
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -33,7 +33,7 @@ template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
 template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
 template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
 template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
-template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer > class TensorReductionOp;
 template<typename XprType> class TensorIndexTupleOp;
 template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
 template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -20,8 +21,8 @@ namespace Eigen {
  */

 namespace internal {
-template<typename Op, typename Dims, typename XprType>
-struct traits<TensorReductionOp<Op, Dims, XprType> >
+  template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
+  struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
 : traits<XprType>
 {
  typedef traits<XprType> XprTraits;
@@ -31,18 +32,24 @@ struct traits<TensorReductionOp<Op, Dims, XprType> >
  typedef typename XprType::Nested Nested;
  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static const int Layout = XprTraits::Layout;
+
+  template <class T> struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
 };

-template<typename Op, typename Dims, typename XprType>
-struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
 {
-  typedef const TensorReductionOp<Op, Dims, XprType>& type;
+  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
 };

-template<typename Op, typename Dims, typename XprType>
-struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
 {
-  typedef TensorReductionOp<Op, Dims, XprType> type;
+  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
 };


@@ -339,8 +346,8 @@ __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnTy
 }  // end namespace internal


-template <typename Op, typename Dims, typename XprType>
-class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
+template <typename Op, typename Dims, typename XprType,  template <class> class MakePointer_>
+class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
@@ -371,18 +378,19 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>


 // Eval as rvalue
-template<typename Op, typename Dims, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
 {
-  typedef TensorReductionOp<Op, Dims, ArgType> XprType;
+  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
+  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
-  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
+  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
@@ -401,7 +409,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
+      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
  {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -471,25 +479,35 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the FullReducer if possible.
-    if (RunningFullReduction &&
+    if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         !RunningOnGPU)) {
+         !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }
-
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
+    else if(RunningOnSycl){
+      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+      if (!data) {
+        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+        m_result = data;
+      }
+      Op reducer(m_reducer);
+      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+      return (m_result != NULL);
+    }

    // Attempt to use an optimized reduction.
    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
@@ -572,7 +590,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
-    if ((RunningFullReduction || RunningOnGPU) && m_result) {
+    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
@@ -644,7 +662,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
    }
  }

-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  /// added for sycl in order to construct the buffer from the sycl device
+  const Device& device() const{return m_device;}
+  /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel
+  const Dims& xprDims() const {return m_xpr_dims;}
+

  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
@@ -737,12 +762,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
  // For full reductions
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
+  static const bool RunningOnSycl = false;
+#elif defined(EIGEN_USE_SYCL)
+static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
+static const bool RunningOnGPU = false;
 #else
  static const bool RunningOnGPU = false;
+  static const bool RunningOnSycl = false;
 #endif
-  CoeffReturnType* m_result;
+  typename MakePointer_<CoeffReturnType>::Type m_result;

  const Device& m_device;
+  const Dims& m_xpr_dims;
 };

 } // end namespace Eigen
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -0,0 +1,242 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclPlaceHolderExpr.h
+ *
+ * \brief:
+ *  This is the specialisation of the placeholder expression based on the
+ * operation type
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+
+namespace Eigen {
+namespace internal {
+
+template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
+template<typename BufferTOut, typename BufferTIn>
+static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+  do {
+          auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
+            cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
+                                    cl::sycl::range<1>{std::min(length, local)}};
+            /* Two accessors are used: one to the buffer that is being reduced,
+             * and a second to local memory, used to store intermediate data. */
+            auto aI =
+                bufI.template get_access<cl::sycl::access::mode::read_write>(h);
+            auto aOut =
+                bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
+            cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
+                               cl::sycl::access::target::local>
+                scratch(cl::sycl::range<1>(local), h);
+
+            /* The parallel_for invocation chosen is the variant with an nd_item
+             * parameter, since the code requires barriers for correctness. */
+            h.parallel_for<KernelName>(
+                r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
+                  size_t globalid = id.get_global(0);
+                  size_t localid = id.get_local(0);
+                  /* All threads collectively read from global memory into local.
+                   * The barrier ensures all threads' IO is resolved before
+                   * execution continues (strictly speaking, all threads within
+                   * a single work-group - there is no co-ordination between
+                   * work-groups, only work-items). */
+                  if (globalid < length) {
+                    scratch[localid] = aI[globalid];
+                  }
+                  id.barrier(cl::sycl::access::fence_space::local_space);
+
+                  /* Apply the reduction operation between the current local
+                   * id and the one on the other half of the vector. */
+                  if (globalid < length) {
+                    int min = (length < local) ? length : local;
+                    for (size_t offset = min / 2; offset > 0; offset /= 2) {
+                      if (localid < offset) {
+                        scratch[localid] += scratch[localid + offset];
+                      }
+                      id.barrier(cl::sycl::access::fence_space::local_space);
+                    }
+                    /* The final result will be stored in local id 0. */
+                    if (localid == 0) {
+                      aI[id.get_group(0)] = scratch[localid];
+                      if((length<=local) && globalid ==0){
+                        aOut[globalid]=scratch[localid];
+                      }
+                    }
+                  }
+                });
+          };
+            dev.m_queue.submit(f);
+            dev.m_queue.throw_asynchronous();
+
+          /* At this point, you could queue::wait_and_throw() to ensure that
+           * errors are caught quickly. However, this would likely impact
+           * performance negatively. */
+          length = length / local;
+
+        } while (length > 1);
+
+
+
+}
+
+};
+
+/// For now let's start with a full reducer
+/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
+/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
+/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
+// a leafNode.
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
+
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  static const bool HasOptimizedImplementation = false;
+
+  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
+    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
+    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+    auto functors = TensorSycl::internal::extractFunctors(self.impl());
+    int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
+    size_t inputSize =self.impl().dimensions().TotalSize();
+    size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
+    size_t remaining = inputSize% red_factor;
+    if(rng ==0) {
+      red_factor=1;
+    };
+    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+    size_t GRange=std::max((size_t )1, rng);
+
+    // convert global range to power of 2 for redecution
+    GRange--;
+    GRange |= GRange >> 1;
+    GRange |= GRange >> 2;
+    GRange |= GRange >> 4;
+    GRange |= GRange >> 8;
+    GRange |= GRange >> 16;
+#if __x86_64__ || __ppc64__ || _WIN64
+    GRange |= GRange >> 32;
+#endif
+    GRange++;
+    size_t  outTileSize = tileSize;
+    /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
+    if (GRange < outTileSize) outTileSize=GRange;
+    // getting final out buffer at the moment the created buffer is true because there is no need for assign
+    auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
+    /// creating the shared memory for calculating reduction.
+    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+    /// recursively apply reduction on it in order to reduce the whole.
+    auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
+    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+    Dims dims= self.xprDims();
+    Op functor = reducer;
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from Evaluator
+      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
+
+      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
+        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
+        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
+        /// the device_evaluator is detectable and recognisable on the device.
+        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+        /// const cast added as a naive solution to solve the qualifier drop error
+        auto globalid=itemID.get_global_linear_id();
+
+        if(globalid<rng)
+          tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
+        else
+          tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+
+        if(remaining!=0 && globalid==0 )
+          // this will add the rest of input buffer when the input size is not devidable to red_factor.
+          tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
+      });
+    });
+  dev.m_queue.throw_asynchronous();
+
+/// This is used to recursively reduce the tmp value to an element of 1;
+  syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
+  }
+
+};
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
+
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  static const bool HasOptimizedImplementation = false;
+
+  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
+    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
+    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+    auto functors = TensorSycl::internal::extractFunctors(self.impl());
+
+    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+
+    size_t GRange=num_coeffs_to_preserve;
+    if (tileSize>GRange) tileSize=GRange;
+    else if(GRange>tileSize){
+      size_t xMode = GRange % tileSize;
+      if (xMode != 0) GRange += (tileSize - xMode);
+    }
+    // getting final out buffer at the moment the created buffer is true because there is no need for assign
+    /// creating the shared memory for calculating reduction.
+    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+    /// recursively apply reduction on it in order to reduce the whole.
+    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+    Dims dims= self.xprDims();
+    Op functor = reducer;
+
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from Evaluator
+      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
+
+      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
+        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
+        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
+        /// the device_evaluator is detectable and recognisable on the device.
+        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
+        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+        /// const cast added as a naive solution to solve the qualifier drop error
+        auto globalid=itemID.get_global_linear_id();
+        if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
+          typename DeiceSelf::CoeffReturnType accum = functor.initialize();
+          GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
+          functor.finalize(accum);
+          output_accessor.get_pointer()[globalid]= accum;
+        }
+      });
+    });
+  dev.m_queue.throw_asynchronous();
+    return false;
+  }
+};
+
+}  // end namespace internal
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -22,6 +22,13 @@ struct MakeGlobalPointer {
  typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
 };

+// global pointer to set different attribute state for a class
+template <class T>
+struct MakeLocalPointer {
+  typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
+};
+
+
 namespace Eigen {
 namespace TensorSycl {
 namespace internal {
@@ -43,9 +50,7 @@ template<typename T> struct GetType<false, T>{
 // tuple construction
 #include "TensorSyclTuple.h"

-// This file contains the PlaceHolder that replaces the actual data
-#include "TensorSyclPlaceHolder.h"
-
+// counting number of leaf at compile time
 #include "TensorSyclLeafCount.h"

 // The index PlaceHolder takes the actual expression and replaces the actual
@@ -57,9 +62,6 @@ template<typename T> struct GetType<false, T>{
 // creation of an accessor tuple from a tuple of SYCL buffers
 #include "TensorSyclExtractAccessor.h"

-// actual data extraction using accessors
-//#include "GetDeviceData.h"
-
 // this is used to change the address space type in tensor map for GPU
 #include "TensorSyclConvertToDeviceExpression.h"

@@ -70,6 +72,9 @@ template<typename T> struct GetType<false, T>{
 // this is used to construct the expression on the device
 #include "TensorSyclExprConstructor.h"

+/// this is used for extracting tensor reduction
+#include "TensorReductionSycl.h"
+
 // kernel execution using fusion
 #include "TensorSyclRun.h"

--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -102,6 +102,18 @@ KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
 KERNELBROKERCONVERT(const, true, TensorEvalToOp)
 KERNELBROKERCONVERT(, false, TensorEvalToOp)
 #undef KERNELBROKERCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
+#define KERNELBROKERCONVERTREDUCTION(CVQual)\
+template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\
+struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\
+  typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\
+};
+
+KERNELBROKERCONVERTREDUCTION(const)
+KERNELBROKERCONVERTREDUCTION()
+#undef KERNELBROKERCONVERTREDUCTION
+
 }  // namespace internal
 }  // namespace TensorSycl
 }  // namespace Eigen
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -33,8 +33,7 @@ struct EvalToLHSConstructor {
  EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}
 };

-/// \struct ExprConstructor is used to reconstruct the expression on the device
-/// and
+/// \struct ExprConstructor is used to reconstruct the expression on the device and
 /// recreate the expression with MakeGlobalPointer containing the device address
 /// space for the TensorMap pointers used in eval function.
 /// It receives the original expression type, the functor of the node, the tuple
@@ -49,7 +48,7 @@ struct ExprConstructor;
 template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
 template <class> class MakePointer_, size_t N, typename... Params>\
 struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
-CVQual Eigen::internal::PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
+CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
  typedef  CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>  Type;\
  Type expr;\
  template <typename FuncDetector>\
@@ -187,7 +186,7 @@ EVALTO()
 #define FORCEDEVAL(CVQual)\
 template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
 struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
-CVQual Eigen::internal::PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
+CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
  typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
  TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
  Type expr;\
@@ -200,14 +199,41 @@ FORCEDEVAL(const)
 FORCEDEVAL()
 #undef FORCEDEVAL

+template <bool Conds,  size_t X , size_t Y > struct ValueCondition {
+  static const size_t Res =X;
+};
+template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
+  static const size_t Res =Y;
+};
+
+/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
+#define SYCLREDUCTIONEXPR(CVQual)\
+template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
+  static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0,  1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
+  typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
+  NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
+};
+
+SYCLREDUCTIONEXPR(const)
+SYCLREDUCTIONEXPR()
+#undef SYCLREDUCTIONEXPR
+
 /// template deduction for \ref ExprConstructor struct
 template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
 auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
    -> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) {
  return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t);
 }
-}
-}
-}  // namespace Eigen
+
+} /// namespace TensorSycl
+} /// namespace internal
+} /// namespace Eigen
+

 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -57,9 +57,9 @@ struct AccessorConstructor{
    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
  }
  template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)
-  -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM, true,
+  -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
  typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
-    return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, true, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
+    return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
  }
 };

@@ -73,14 +73,12 @@ struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >
  }
 };

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-///  TensorCwiseNullaryOp,  TensorCwiseUnaryOp and  TensorBroadcastingOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp,  TensorCwiseUnaryOp and  TensorBroadcastingOp
 template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
 : ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorCwiseBinaryOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
 template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)
@@ -88,9 +86,7 @@ struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr
    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
  }
 };
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseBinaryOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
 template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
 : ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
@@ -105,8 +101,7 @@ struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2E
  }
 };

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseTernaryOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
 template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
 : ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
@@ -127,8 +122,7 @@ template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
 : ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorAssignOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
 template <typename LHSExpr, typename RHSExpr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)
@@ -137,14 +131,12 @@ struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, D
 }
 };

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorAssignOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
 template <typename LHSExpr, typename RHSExpr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
 : ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorMap
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
 #define TENSORMAPEXPR(CVQual, ACCType)\
 template <typename PlainObjectType, int Options_, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
@@ -157,8 +149,7 @@ TENSORMAPEXPR(const, cl::sycl::access::mode::read)
 TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
 #undef TENSORMAPEXPR

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorForcedEvalOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
 template <typename Expr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)
@@ -167,14 +158,12 @@ struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
  }
 };

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorForcedEvalOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
 template <typename Expr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
 : ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorEvalToOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
 template <typename Expr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)
@@ -183,19 +172,33 @@ struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
  }
 };

-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorEvalToOp
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
 template <typename Expr, typename Dev>
 struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
 : ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};

+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)
+  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};
+
 /// template deduction for \ref ExtractAccessor
 template <typename Evaluator>
 auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
 -> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
  return ExtractAccessor<Evaluator>::getTuple(cgh, expr);
 }
-}
-}
-}
+
+} /// namespace TensorSycl
+} /// namespace internal
+} /// namespace Eigen
 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -141,7 +141,30 @@ template <typename RHSExpr, typename Dev>
 struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
 : FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};

+template<typename Dim, size_t NumOutputDim> struct DimConstr {
+template<typename InDim>
+  static inline Dim getDim(InDim dims ) {return dims;}
+};

+template<typename Dim> struct DimConstr<Dim, 0> {
+  template<typename InDim>
+    static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
+};
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{
+  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
+  typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;
+  const Dimensions m_dimensions;
+  const Dimensions& dimensions() const { return m_dimensions; }
+  FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
+  : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}
+};
+
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
+: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
 /// template deduction function for FunctorExtractor
 template <typename Evaluator>
 auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Gael Guennebaud	477d1e8192	Bump to 3.3.2	2017-01-18 15:06:40 +01:00
Gael Guennebaud	0eaff8fdf2	Defer set-to-zero in triangular = product so that no aliasing issue occur in the common: A.triangularView() = BA.sefladjointView()B.adjoint() case that used to work in 3.2. (grafted from `655ba783f8` )	2017-01-17 18:03:35 +01:00
Gael Guennebaud	582c96691b	Fix typo	2017-01-16 13:36:56 +01:00
Gael Guennebaud	0b22158d9f	Add missing doc of SparseView (grafted from `831fffe874` )	2017-01-06 18:01:29 +01:00
Gael Guennebaud	dafdb0d8a8	MSVC 2015 has all we want about c++11 and MSVC 2017 fails on binder1st/binder2nd (grafted from `e383d6159a` )	2017-01-06 15:44:13 +01:00
Gael Guennebaud	1d1686c62b	Convert integers to real numbers when computing relative L2 error (grafted from `f3f026c9aa` )	2017-01-05 13:36:08 +01:00
Gael Guennebaud	ad95b924d0	Fix and workaround several doxygen issues/warnings (grafted from `2299717fd5` )	2017-01-04 23:27:33 +01:00
Gael Guennebaud	9499684320	Add doc for sparse triangular solve functions (grafted from `ee6f7f6c0c` )	2017-01-04 23:10:36 +01:00
Gael Guennebaud	5b6a31626b	Add missing snippet files. (grafted from `5165de97a4` )	2017-01-04 23:08:27 +01:00
Gael Guennebaud	bc3fee2d8e	bug #1336 : workaround doxygen failing to include numerous members of MatriBase in Matrix (grafted from `a0a36ad0ef` )	2017-01-04 22:02:39 +01:00
Gael Guennebaud	eaa9223277	Document selfadjointView (grafted from `29a1a58113` )	2017-01-04 22:01:50 +01:00
Gael Guennebaud	c9ba1165e7	bug #1336 : fix doxygen issue regarding EIGEN_CWISE_BINARY_RETURN_TYPE (grafted from `a5ebc92f8d` )	2017-01-04 18:21:44 +01:00
Gael Guennebaud	dd2d5d67ff	bug #1370 : add doc for StorageIndex (grafted from `8702562177` )	2017-01-03 11:25:41 +01:00
Gael Guennebaud	404322b64f	bug #1370 : rename _Index to _StorageIndex in SparseMatrix, and add a warning in the doc regarding the 3.2 to 3.3 change of SparseMatrix::Index (grafted from `575c078759` )	2017-01-03 11:19:14 +01:00
Marco Falke	ce37bae2cd	doc: Fix trivial typo in AsciiQuickReference.txt * * * fixup! (grafted from `4ebf69394d` )	2017-01-01 13:25:48 +00:00
Gael Guennebaud	3900dbc341	Make sure that traits<CwiseBinaryOp>::Flags reports the correct storage order so that methods like .outerSize()/.innerSize() work properly. (grafted from `d32a43e33a` )	2016-12-27 16:35:45 +01:00
Gael Guennebaud	5f586c2bd0	Add missing .outer() member to iterators of evaluators of cwise sparse binary expression (grafted from `7136267461` )	2016-12-27 16:34:30 +01:00
Gael Guennebaud	215f88a417	Fix check of storage order mismatch for "sparse cwiseop sparse". (grafted from `fe0ee72390` )	2016-12-27 16:33:19 +01:00
Gael Guennebaud	2257f40f4a	Merged in angelos_m/eigen/3.3 (pull request PR-269) Remove superfluous const's (can cause warnings on some Intel compilers)	2016-12-21 08:53:16 +01:00
Gael Guennebaud	9e0fa0ef6d	Fix bug #1367 : compilation fix for gcc 4.1! (grafted from `94e8d8902f` )	2016-12-20 22:17:01 +01:00
Gael Guennebaud	0fddbf3dc7	Add transpose, adjoint, conjugate methods to SelfAdjointView (useful to write generic code) (grafted from `684cfc762d` )	2016-12-20 16:33:53 +01:00
Gael Guennebaud	eda635bd58	Make sure that HyperPlane::transform manitains a unit normal vector in the Affine case. (grafted from `f5d644b415` )	2016-12-20 09:35:00 +01:00
Benoit Jacob	26197bb467	Use 32 registers on ARM64	2016-12-19 13:44:46 -05:00
Gael Guennebaud	772e59d475	bug #1360 : fix sign issue with pmull on altivec (grafted from `8c0e701504` )	2016-12-18 22:13:19 +00:00
Gael Guennebaud	e8f83cbb5d	Fix unused warning (grafted from `fc94258e77` )	2016-12-18 22:11:48 +00:00
Gael Guennebaud	dce584d799	bug #1363 : fix mingw's ABI issue (grafted from `5d00fdf0e8` )	2016-12-15 11:58:31 +01:00
Gael Guennebaud	0bcef9557d	bug #1358 : fix compilation for sparse += sparse.selfadjointView(); (grafted from `11b492e993` )	2016-12-14 17:53:47 +01:00
Gael Guennebaud	2b3c876b2a	bug #1359 : fix compilation of col_major_sparse.row() *= scalar (used to work in 3.2.9 though the expression is not really writable) (grafted from `e67397bfa7` )	2016-12-14 17:05:26 +01:00
Gael Guennebaud	a05f6aad0e	bug #1359 : fix sparse /=scalar and *=scalar implementation. InnerIterators must be obtained from an evaluator. (grafted from `98d7458275` )	2016-12-14 17:03:13 +01:00
Gael Guennebaud	59187285e1	bug #1361 : fix compilation issue in mat=perm.inverse() (grafted from `c817ce3ba3` )	2016-12-13 23:10:27 +01:00
Angelos Mantzaflaris	1dd074ea7e	Merged eigen/eigen/3.3 into 3.3	2016-12-07 01:01:50 +01:00
Angelos Mantzaflaris	24fa7a01bd	merge	2016-12-07 00:43:55 +01:00
Angelos Mantzaflaris	e236d3443c	Remove superfluous const's (can cause warnings on some Intel compilers)	2016-12-07 00:37:48 +01:00
Gael Guennebaud	4ec8833220	Added tag 3.3.1 for changeset `dd3685cc6a`	2016-12-06 11:44:02 +01:00
Gael Guennebaud	dd3685cc6a	Bump to 3.3.1	2016-12-06 11:43:58 +01:00
Gael Guennebaud	487a6e6515	Explain how to choose your favorite Eigen version (grafted from `0c4d05b009` )	2016-12-06 11:34:06 +01:00
Silvio Traversaro	75f0b8aae3	Added relocatable cmake support also for CMake before 3.0 and after 2.8.8 (grafted from `e049a2a72a` )	2016-12-06 10:37:34 +01:00
Gael Guennebaud	23aca8a586	Optimize SparseLU::solve for rhs vectors (grafted from `8640ffac65` )	2016-12-05 15:41:14 +01:00
Gael Guennebaud	28bf2bf070	remove temporary in SparseLU::solve (grafted from `62acd67903` )	2016-12-05 15:11:57 +01:00
Silvio Traversaro	0164f4c682	Make CMake config file relocatable (grafted from `18481b518f` )	2016-12-05 10:39:52 +01:00
Gael Guennebaud	bbff608a42	Merged in angelos_m/eigen/3.3 (pull request PR-264) add explicit template to numext::abs2 and fix signed/unsigned warning	2016-12-05 21:56:01 +00:00
Gael Guennebaud	ea56d2ff2c	Fix memory leak in Ref<Sparse> (grafted from `a6b971e291` )	2016-12-05 16:59:30 +01:00
Gael Guennebaud	a4c8701e9a	bug #1356 : fix calls to evaluator::coeffRef(0,0) to get the address of the destination by adding a dstDataPtr() member to the kernel. This fixes undefined behavior if dst is empty (nullptr). (grafted from `0db6d5b3f4` )	2016-12-05 15:08:09 +01:00
Gael Guennebaud	a9bb9796e0	Ease compiler job to generate clean and efficient code in mat*vec. (grafted from `66f65ccc36` )	2016-12-02 22:41:26 +01:00
Gael Guennebaud	449883be74	Operators += and -= do not resize! (grafted from `fe696022ec` )	2016-12-02 22:40:25 +01:00
Angelos Mantzaflaris	0a08d4c60b	use numext::abs	2016-12-02 11:48:06 +01:00
Angelos Mantzaflaris	4086187e49	1. Add explicit template to abs2 (resolves deduction for some arithmetic types) 2. Avoid signed-unsigned conversion in comparison (warning in case Scalar is unsigned)	2016-12-02 11:39:18 +01:00
Christoph Hertzberg	91864f85d3	bug #1355 : Fixed wrong line-endings on two files (grafted from `22f7d398e2` )	2016-12-02 11:22:05 +01:00
Gael Guennebaud	c3597106ab	Merged in angelos_m/eigen/3.3 (pull request PR-263) fix two warnings(unused typedef, unused variable) and a typo	2016-12-02 09:02:39 +00:00
Gael Guennebaud	aed1d6597f	Clean up SparseCore module regarding ReverseInnerIterator (grafted from `27873008d4` )	2016-12-01 21:55:10 +01:00
Angelos Mantzaflaris	b6f04a2dd4	typo UIntPtr	2016-12-01 21:25:58 +01:00
Angelos Mantzaflaris	a9aa3bcf50	fix two warnings(unused typedef, unused variable) and a typo	2016-12-01 21:23:43 +01:00
Gael Guennebaud	32b8da66e3	fix member order (grafted from `181138a1cb` )	2016-12-01 17:06:20 +01:00
Gael Guennebaud	eb94179ea3	Merged in sergiu/eigen/cmake-imported-target (pull request PR-257) CMake imported target (take #2)	2016-12-01 15:13:48 +00:00
Gael Guennebaud	52a7386aef	Fix misleading-indentation warnings. (grafted from `037b46762d` )	2016-12-01 16:05:42 +01:00
Gael Guennebaud	8cada1d894	Fix slection of product implementation for dynamic size matrices with fixed max size. (grafted from `8df272af88` )	2016-11-30 22:21:33 +01:00
Gael Guennebaud	6e4a664c42	Fix a performance regression in (matmat)vec for which mat*mat was evaluated multiple times. (grafted from `c927af60ed` )	2016-11-30 17:59:13 +01:00
Gael Guennebaud	1cd1a96d56	bug #1351 : fix compilation of random with old compilers (grafted from `ab4ef5e66e` )	2016-11-30 17:37:53 +01:00
Sergiu Deitsch	86ab00cdcf	cmake: remove architecture dependency from Eigen3ConfigVersion.cmake Also, install Eigen3*.cmake under $prefix/share/eigen3/cmake by default.	2016-11-30 15:46:46 +01:00
Sergiu Deitsch	65f09be8d2	doc: mention the NO_MODULE option and target availability	2016-11-30 15:41:38 +01:00
Gael Guennebaud	400d756b82	bug #1348 : Document EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES, and reflect in the doc that EIGEN_DONT_ALIGN* are deprecated. (grafted from `21d0286d81` )	2016-11-23 22:15:03 +01:00
Gael Guennebaud	9d31798a84	update cdash project for 3.3	2016-11-23 14:13:08 +01:00
Gael Guennebaud	723ed92e0e	Fix compilation with gcc and old ABI version (grafted from `e340866c81` )	2016-11-23 14:04:57 +01:00
Gael Guennebaud	0a7de0b273	Fix compilation issue with MSVC: MSVC always messes up with shadowed template arguments, for instance in: struct B { typedef float T; } template<typename T> struct A : B { T g; }; The type of A<double>::g will be float and not double. (grafted from `a91de27e98` )	2016-11-23 12:24:48 +01:00
Gael Guennebaud	d6b9bc1ccd	Optimize predux<Packet8f> (AVX) (grafted from `74637fa4e3` )	2016-11-22 21:57:52 +01:00
Gael Guennebaud	0eff51e2ed	Disable usage of SSE3 _mm_hadd_ps that is extremely slow. (grafted from `178c084856` )	2016-11-22 21:53:14 +01:00
Gael Guennebaud	1b7dd46d94	Optimize predux<Packet4d> (AVX) (grafted from `7dd894e40e` )	2016-11-22 21:41:30 +01:00
Gael Guennebaud	b2eb1bf3dc	Disable usage of SSE3 haddpd that is extremely slow. (grafted from `f3fb0a1940` )	2016-11-22 16:58:31 +01:00
Gael Guennebaud	fe48c25682	Revert vec/y to vec*(1/y) in row-major TRSM: - div is extremely costly - this is consistent with the column-major case - this is consistent with all other BLAS implementations (grafted from `eb621413c1` )	2016-12-06 15:04:50 +01:00
Gael Guennebaud	0ba6da3470	Fix BLAS backend for symmetric rank K updates. (grafted from `8365c2c941` )	2016-12-06 14:47:09 +01:00
Sergiu Deitsch	a287140f72	cmake: added Eigen3::Eigen imported target	2016-11-22 12:25:06 +01:00
Gael Guennebaud	4d89ec8a00	Fix regression in assigment of sparse block to spasre block. (grafted from `6a84246a6a` )	2016-11-21 21:46:42 +01:00
Chun Wang	441760f239	Workaround for error in VS2012 with /clr (grafted from `0d0948c3b9` )	2016-11-17 17:54:27 -05:00
Gael Guennebaud	664162fb8a	Fix compilation issue in mat = permutation (regression introduced in `8193ffb3d3` ) (grafted from `465ede0f20` )	2016-11-20 09:41:37 +01:00
Gael Guennebaud	aa3c761002	bug #1343 : fix compilation regression in mat+=selfadjoint_view. Generic EigenBase2EigenBase assignment was incomplete. (grafted from `8193ffb3d3` )	2016-11-18 10:17:34 +01:00
Gael Guennebaud	94f2cfc9c7	bug #1343 : fix compilation regression in array = matrix_product (grafted from `cebff7e3a2` )	2016-11-18 10:09:33 +01:00
Konstantinos Margaritis	4a13d79df6	replace sizeof(Packet) with PacketSize else it breaks for ZVector.Packet4f (grafted from `a1d5c503fa` )	2016-11-17 13:27:45 -05:00
Konstantinos Margaritis	463176cc44	implement float/std::complex<float> for ZVector as well, minor fixes to ZVector (grafted from `672aa97d4d` )	2016-11-17 13:27:33 -05:00
Gael Guennebaud	5aab97fba6	Optimize sparse<bool> && sparse<bool> to use the same path as for coeff-wise products. (grafted from `0ee92aa38e` )	2016-11-14 18:47:41 +01:00
Gael Guennebaud	89abc6806d	bug #426 : move operator && and \|\| to MatrixBase and SparseMatrixBase. (grafted from `2e334f5da0` )	2016-11-14 18:47:02 +01:00
Niels Ole Salscheider	baf793ebaa	Make sure not to call numext::maxi on expression templates (grafted from `51fef87408` )	2016-11-12 12:20:57 +01:00
Gael Guennebaud	b4ddafcfac	Fix regression in SparseMatrix::ReverseInnerIterator (grafted from `eedb87f4ba` )	2016-11-14 14:05:53 +01:00
Gael Guennebaud	1079967710	Added tag 3.3.0 for changeset `eeac81b8c0`	2016-11-10 13:57:29 +01:00
Gael Guennebaud	eeac81b8c0	bump to 3.3.0	2016-11-10 13:55:14 +01:00
Gael Guennebaud	e80bc2ddb0	Fix printing of sparse expressions	2016-11-10 10:35:32 +01:00
Benoit Steiner	db3903498d	Merged in benoitsteiner/opencl (pull request PR-246) Improved support for OpenCL	2016-11-08 22:28:44 +00:00
Benoit Steiner	dcc14bee64	Fixed the formatting of the code	2016-11-08 14:24:46 -08:00
Benoit Steiner	b88c1117d4	Fixed the indentation of the cmake file	2016-11-08 14:22:36 -08:00
Luke Iwanski	912cb3d660	#if EIGEN_EXCEPTION -> #ifdef EIGEN_EXCEPTIONS.	2016-11-08 22:01:14 +00:00
Luke Iwanski	1b345b0895	Fix for SYCL queue initialisation.	2016-11-08 21:56:31 +00:00
Luke Iwanski	1b95717358	Use try/catch only when exceptions are enabled.	2016-11-08 21:08:53 +00:00
Mehdi Goli	d57430dd73	Converting all sycl buffers to uninitialised device only buffers; adding memcpyHostToDevice and memcpyDeviceToHost on syclDevice; modifying all examples to obey the new rules; moving sycl queue creating to the device based on Benoit suggestion; removing the sycl specefic condition for returning m_result in TensorReduction.h according to Benoit suggestion.	2016-11-08 17:08:02 +00:00
Gael Guennebaud	73985ead27	Extend unit test to check sparse solvers with a SparseVector as the rhs and result.	2016-11-06 20:29:57 +01:00
Gael Guennebaud	436a111792	Generalize Cholmod support to hanlde any sparse type as the rhs and result of the solve method	2016-11-06 20:29:23 +01:00
Gael Guennebaud	afc55b1885	Generalize IterativeSolverBase::solve to hanlde any sparse type as the results (instead of SparseMatrix only)	2016-11-06 20:28:18 +01:00
Gael Guennebaud	a5c2d8a3cc	Generalize solve_sparse_through_dense_panels to handle SparseVector.	2016-11-06 15:20:58 +01:00
Gael Guennebaud	f8bfe10613	Add missing friend declaration	2016-11-06 15:20:30 +01:00
Gael Guennebaud	fc7180cda8	Add a default ctor to evaluator<SparseVector>. Needed for evaluator<Solve>.	2016-11-06 15:20:00 +01:00
Gael Guennebaud	4d226ab5b5	Enable swapping between SparseMatrix and SparseVector	2016-11-06 15:15:03 +01:00
Benoit Steiner	ad086b03e4	Removed unnecessary statement	2016-11-05 12:43:27 -07:00
Benoit Steiner	dad177be01	Added missing includes	2016-11-05 10:04:42 -07:00
Gael Guennebaud	55b4fd1d40	Extend mpreal unit test to check LLT with complexes.	2016-11-05 11:28:53 +01:00
Gael Guennebaud	a354c3ca59	Fix compilation of LLT with complex<mpreal>.	2016-11-05 11:28:29 +01:00
Benoit Steiner	d46a36cc84	Merged eigen/eigen into default	2016-11-04 18:22:55 -07:00
Mehdi Goli	0ebe3808ca	Removed the sycl include from Eigen/Core and moved it to Unsupported/Eigen/CXX11/Tensor; added TensorReduction for sycl (full reduction and partial reduction); added TensorReduction test case for sycl (full reduction and partial reduction); fixed the tile size on TensorSyclRun.h based on the device max work group size;	2016-11-04 18:18:19 +00:00
Gael Guennebaud	47d1b4a609	Added tag 3.3-rc2 for changeset `ba05572dcb`	2016-11-04 09:09:18 +01:00