Fix a previous mistake in moving the 2.0-beta5 in svn for some last-minute

fixes.
Remove this old file. It was stalling in history because of a bug in svn,
2026-04-10 11:34:33 +08:00 · 2009-05-16 17:23:42 +02:00 · 2009-05-16 17:20:44 +02:00 · 2009-05-16 14:05:54 +00:00 · 2009-05-15 16:05:45 +00:00 · 2009-05-15 15:54:52 +00:00
261 changed files with 16169 additions and 3080 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 project(Eigen)
-set(EIGEN_VERSION_NUMBER "2.0.2")
+set(EIGEN_VERSION_NUMBER "2.0.52-unstable")

 #if the svnversion program is absent, this will leave the SVN_REVISION string empty,
 #but won't stop CMake.
@@ -17,6 +17,8 @@ endif(EIGEN_SVN_REVISION)

 cmake_minimum_required(VERSION 2.6.2)

+include(CheckCXXCompilerFlag)
+
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

 option(EIGEN_BUILD_TESTS "Build tests" OFF)
@@ -34,7 +36,13 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON)

 if(CMAKE_COMPILER_IS_GNUCXX)
  if(CMAKE_SYSTEM_NAME MATCHES Linux)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wnon-virtual-dtor -Wno-long-long -ansi -Wundef -Wcast-align -Wchar-subscripts -Wall -W -Wpointer-arith -Wwrite-strings -Wformat-security -Wextra -fno-exceptions -fno-check-new -fno-common -fstrict-aliasing")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wnon-virtual-dtor -Wno-long-long -ansi -Wundef -Wcast-align -Wchar-subscripts -Wall -W -Wpointer-arith -Wwrite-strings -Wformat-security -fno-exceptions -fno-check-new -fno-common -fstrict-aliasing")
+
+    check_cxx_compiler_flag("-Wextra" COMPILER_SUPPORT_WEXTRA)
+    if(COMPILER_SUPPORT_WEXTRA)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra")
+    endif()
+
    if(NOT EIGEN_TEST_LIB)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic")
    endif(NOT EIGEN_TEST_LIB)
@@ -84,14 +92,22 @@ endif(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION)

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

+set(INCLUDE_INSTALL_DIR
+    "${CMAKE_INSTALL_PREFIX}/include/eigen2"
+    CACHE PATH
+    "The directory where we install the header files"
+    FORCE)
+
 add_subdirectory(Eigen)

+add_subdirectory(doc)
+
 if(EIGEN_BUILD_TESTS)
  include(CTest)
  add_subdirectory(test)
 endif(EIGEN_BUILD_TESTS)

-add_subdirectory(doc)
+add_subdirectory(unsupported)

 if(EIGEN_BUILD_DEMOS)
  add_subdirectory(demos)
@@ -100,3 +116,7 @@ endif(EIGEN_BUILD_DEMOS)
 if(EIGEN_BUILD_BTL)
  add_subdirectory(bench/btl)
 endif(EIGEN_BUILD_BTL)
+
+if(EIGEN_BUILD_TESTS)
+  ei_testing_print_summary()
+endif(EIGEN_BUILD_TESTS)
--- a/Eigen/Array
+++ b/Eigen/Array
@@ -16,8 +16,12 @@ namespace Eigen {
  *  - sin, cos, sqrt, pow, exp, log, square, cube, inverse (reciprocal).
  *
  * This module also provides various MatrixBase methods, including:
-  *  - \ref MatrixBase::all() "all", \ref MatrixBase::any() "any",
-  *  - \ref MatrixBase::Random() "random matrix initialization"
+  *  - boolean reductions: \ref MatrixBase::all() "all", \ref MatrixBase::any() "any", \ref MatrixBase::count() "count",
+  *  - \ref MatrixBase::Random() "random matrix initialization",
+  *  - a \ref MatrixBase::select() "select" function mimicking the trivariate ?: operator,
+  *  - \ref MatrixBase::colwise() "column-wise" and \ref MatrixBase::rowwise() "row-wise" reductions,
+  *  - \ref MatrixBase::reverse() "matrix reverse",
+  *  - \ref MatrixBase::lpNorm() "generic matrix norm".
  *
  * \code
  * #include <Eigen/Array>
@@ -31,6 +35,8 @@ namespace Eigen {
 #include "src/Array/PartialRedux.h"
 #include "src/Array/Random.h"
 #include "src/Array/Norms.h"
+#include "src/Array/Replicate.h"
+#include "src/Array/Reverse.h"

 } // namespace Eigen

--- a/Eigen/CMakeLists.txt
+++ b/Eigen/CMakeLists.txt
@@ -20,15 +20,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -g1 -O2")
 endif(CMAKE_COMPILER_IS_GNUCXX)

-set(INCLUDE_INSTALL_DIR
-    "${CMAKE_INSTALL_PREFIX}/include/eigen2"
-    CACHE PATH
-    "The directory where we install the header files"
-    FORCE)
-
 install(FILES
  ${Eigen_HEADERS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
  )

 add_subdirectory(src)
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -38,8 +38,8 @@ namespace Eigen {
 } // namespace Eigen

 #define EIGEN_CHOLESKY_MODULE_INSTANTIATE_TYPE(MATRIXTYPE,PREFIX) \
-  PREFIX template class Cholesky<MATRIXTYPE>; \
-  PREFIX template class CholeskyWithoutSquareRoot<MATRIXTYPE>
+  PREFIX template class LLT<MATRIXTYPE>; \
+  PREFIX template class LDLT<MATRIXTYPE>

 #define EIGEN_CHOLESKY_MODULE_INSTANTIATE(PREFIX) \
  EIGEN_CHOLESKY_MODULE_INSTANTIATE_TYPE(Matrix2f,PREFIX); \
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -1,3 +1,28 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
 #ifndef EIGEN_CORE_H
 #define EIGEN_CORE_H

@@ -11,7 +36,7 @@
    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(_M_X64)
      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
-    #endif   
+    #endif
  #endif
 #endif

@@ -26,6 +51,10 @@
  #define EIGEN_SSE2_BUT_NOT_OLD_GCC
 #endif

+#ifdef EIGEN_DONT_ALIGN
+  #define EIGEN_DONT_VECTORIZE
+#endif
+
 #ifndef EIGEN_DONT_VECTORIZE
  #if defined (EIGEN_SSE2_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
    #define EIGEN_VECTORIZE
@@ -101,6 +130,7 @@ namespace Eigen {

 #if defined EIGEN_VECTORIZE_SSE
  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_ALTIVEC
  #include "src/Core/arch/AltiVec/PacketMath.h"
 #endif
@@ -120,6 +150,7 @@ namespace Eigen {

 #include "src/Core/MatrixStorage.h"
 #include "src/Core/NestByValue.h"
+#include "src/Core/ReturnByValue.h"
 #include "src/Core/Flagged.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Cwise.h"
@@ -127,7 +158,6 @@ namespace Eigen {
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
 #include "src/Core/Dot.h"
-#include "src/Core/Product.h"
 #include "src/Core/DiagonalProduct.h"
 #include "src/Core/SolveTriangular.h"
 #include "src/Core/MapBase.h"
@@ -136,8 +166,7 @@ namespace Eigen {
 #include "src/Core/Minor.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
-#include "src/Core/DiagonalCoeffs.h"
-#include "src/Core/Sum.h"
+#include "src/Core/Diagonal.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
@@ -145,7 +174,10 @@ namespace Eigen {
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
 #include "src/Core/Part.h"
-#include "src/Core/CacheFriendlyProduct.h"
+#include "src/Core/Product.h"
+#include "src/Core/products/GeneralMatrixMatrix.h"
+#include "src/Core/products/GeneralMatrixVector.h"
+#include "src/Core/products/SelfadjointMatrixVector.h"

 } // namespace Eigen

--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -32,6 +32,7 @@ namespace Eigen {
  */

 #include "src/Geometry/OrthoMethods.h"
+#include "src/Geometry/Homogeneous.h"
 #include "src/Geometry/RotationBase.h"
 #include "src/Geometry/Rotation2D.h"
 #include "src/Geometry/Quaternion.h"
@@ -44,6 +45,10 @@ namespace Eigen {
 #include "src/Geometry/ParametrizedLine.h"
 #include "src/Geometry/AlignedBox.h"

+#if defined EIGEN_VECTORIZE_SSE
+  #include "src/Geometry/arch/Geometry_SSE.h"
+#endif
+
 } // namespace Eigen

 #include "src/Core/util/EnableMSVCWarnings.h"
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -19,6 +19,7 @@ namespace Eigen {
  */

 #include "src/LU/LU.h"
+#include "src/LU/PartialLU.h"
 #include "src/LU/Determinant.h"
 #include "src/LU/Inverse.h"

--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@@ -22,7 +22,6 @@
 #endif

 #ifdef EIGEN_TAUCS_SUPPORT
-
  // taucs.h declares a lot of mess
  #define isnan
  #define finite
@@ -40,26 +39,28 @@
  #ifdef max
    #undef max
  #endif
-
+  #ifdef complex
+    #undef complex
+  #endif
 #endif

 #ifdef EIGEN_SUPERLU_SUPPORT
  typedef int int_t;
-  #include "superlu/slu_Cnames.h"
-  #include "superlu/supermatrix.h"
-  #include "superlu/slu_util.h"
+  #include "slu_Cnames.h"
+  #include "supermatrix.h"
+  #include "slu_util.h"

  namespace SuperLU_S {
-  #include "superlu/slu_sdefs.h"
+  #include "slu_sdefs.h"
  }
  namespace SuperLU_D {
-  #include "superlu/slu_ddefs.h"
+  #include "slu_ddefs.h"
  }
  namespace SuperLU_C {
-  #include "superlu/slu_cdefs.h"
+  #include "slu_cdefs.h"
  }
  namespace SuperLU_Z {
-  #include "superlu/slu_zdefs.h"
+  #include "slu_zdefs.h"
  }
  namespace Eigen { struct SluMatrix; }
 #endif
@@ -102,6 +103,8 @@ namespace Eigen {
 #include "src/Sparse/SparseFuzzy.h"
 #include "src/Sparse/SparseFlagged.h"
 #include "src/Sparse/SparseProduct.h"
+#include "src/Sparse/SparseDiagonalProduct.h"
+#include "src/Sparse/SparseTriangular.h"
 #include "src/Sparse/TriangularSolver.h"
 #include "src/Sparse/SparseLLT.h"
 #include "src/Sparse/SparseLDLT.h"
--- a/Eigen/StdVector
+++ b/Eigen/StdVector
@@ -1,49 +1,71 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2009 Hauke Heibel <hauke.heibel@googlemail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
 #ifndef EIGEN_STDVECTOR_MODULE_H
 #define EIGEN_STDVECTOR_MODULE_H

-#if defined(_GLIBCXX_VECTOR) || defined(_VECTOR_)
-#error you must include Eigen/StdVector before std::vector
-#endif                                                    
-
-#ifndef EIGEN_GNUC_AT_LEAST
-#ifdef __GNUC__
-  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__>=x && __GNUC_MINOR__>=y) || __GNUC__>x)
-#else
-  #define EIGEN_GNUC_AT_LEAST(x,y) 0
-#endif
-#endif
-
-#define vector std_vector
-#include <vector>        
-#undef vector
+#include "Core"
+#include <vector>

 namespace Eigen {

-template<typename T> class aligned_allocator;
+// This one is needed to prevent reimplementing the whole std::vector.
+template <class T>
+class aligned_allocator_indirection : public aligned_allocator<T>
+{
+public:
+  typedef size_t    size_type;
+  typedef ptrdiff_t difference_type;
+  typedef T*        pointer;
+  typedef const T*  const_pointer;
+  typedef T&        reference;
+  typedef const T&  const_reference;
+  typedef T         value_type;

-// meta programming to determine if a class has a given member
-struct ei_does_not_have_aligned_operator_new_marker_sizeof {int a[1];};
-struct ei_has_aligned_operator_new_marker_sizeof {int a[2];};
+  template<class U>
+  struct rebind
+  {
+    typedef aligned_allocator_indirection<U> other;
+  };

-template<typename ClassType>
-struct ei_has_aligned_operator_new {
-    template<typename T>
-    static ei_has_aligned_operator_new_marker_sizeof
-    test(T const *, typename T::ei_operator_new_marker_type const * = 0);
-    static ei_does_not_have_aligned_operator_new_marker_sizeof
-    test(...);
-
-    // note that the following indirection is needed for gcc-3.3
-    enum {ret =  sizeof(test(static_cast<ClassType*>(0))) 
-              == sizeof(ei_has_aligned_operator_new_marker_sizeof) };
+  aligned_allocator_indirection() throw() {}
+  aligned_allocator_indirection(const aligned_allocator_indirection& ) throw() : aligned_allocator<T>() {}
+  aligned_allocator_indirection(const aligned_allocator<T>& ) throw() {}
+  template<class U>
+  aligned_allocator_indirection(const aligned_allocator_indirection<U>& ) throw() {}
+  template<class U>
+  aligned_allocator_indirection(const aligned_allocator<U>& ) throw() {}
+  ~aligned_allocator_indirection() throw() {}
 };

 #ifdef _MSC_VER
-  
+
  // sometimes, MSVC detects, at compile time, that the argument x
  // in std::vector::resize(size_t s,T x) won't be aligned and generate an error
  // even if this function is never called. Whence this little wrapper.
-  #define _EIGEN_WORKAROUND_MSVC_STD_VECTOR(T) Eigen::ei_workaround_msvc_std_vector<T>
+  #define EIGEN_WORKAROUND_MSVC_STD_VECTOR(T) Eigen::ei_workaround_msvc_std_vector<T>
  template<typename T> struct ei_workaround_msvc_std_vector : public T
  {
    inline ei_workaround_msvc_std_vector() : T() {}
@@ -59,7 +81,7 @@ struct ei_has_aligned_operator_new {

 #else

-  #define _EIGEN_WORKAROUND_MSVC_STD_VECTOR(T) T
+  #define EIGEN_WORKAROUND_MSVC_STD_VECTOR(T) T

 #endif

@@ -73,59 +95,71 @@ namespace std {
    typedef typename vector_base::allocator_type allocator_type; \
    typedef typename vector_base::size_type size_type;  \
    typedef typename vector_base::iterator iterator;  \
-    explicit vector(const allocator_type& __a = allocator_type()) : vector_base(__a) {}  \
+    typedef typename vector_base::const_iterator const_iterator;  \
+    explicit vector(const allocator_type& a = allocator_type()) : vector_base(a) {}  \
+    template<typename InputIterator> \
+    vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \
+    : vector_base(first, last, a) {} \
    vector(const vector& c) : vector_base(c) {}  \
-    vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
+    explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
    vector(iterator start, iterator end) : vector_base(start, end) {}  \
-    vector& operator=(const vector& __x) {  \
-      vector_base::operator=(__x);  \
+    vector& operator=(const vector& x) {  \
+      vector_base::operator=(x);  \
      return *this;  \
    }

-template<typename T,
-         typename AllocT = std::allocator<T>,
-         bool HasAlignedNew = Eigen::ei_has_aligned_operator_new<T>::ret>
-class vector : public std::std_vector<T,AllocT>
+template<typename T>
+class vector<T,Eigen::aligned_allocator<T> >
+  : public vector<EIGEN_WORKAROUND_MSVC_STD_VECTOR(T),
+                  Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STD_VECTOR(T)> >
 {
-  typedef std_vector<T, AllocT> vector_base;
-  EIGEN_STD_VECTOR_SPECIALIZATION_BODY
-};
-
-template<typename T,typename DummyAlloc>
-class vector<T,DummyAlloc,true>
-  : public std::std_vector<_EIGEN_WORKAROUND_MSVC_STD_VECTOR(T),
-                           Eigen::aligned_allocator<_EIGEN_WORKAROUND_MSVC_STD_VECTOR(T)> >          
-{
-  typedef std_vector<_EIGEN_WORKAROUND_MSVC_STD_VECTOR(T),
-                     Eigen::aligned_allocator<_EIGEN_WORKAROUND_MSVC_STD_VECTOR(T)> > vector_base;
+  typedef vector<EIGEN_WORKAROUND_MSVC_STD_VECTOR(T),
+                 Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STD_VECTOR(T)> > vector_base;
  EIGEN_STD_VECTOR_SPECIALIZATION_BODY

-  void resize(size_type __new_size)
-  { resize(__new_size, T()); }     
+  void resize(size_type new_size)
+  { resize(new_size, T()); }

-  #if defined(_VECTOR_)
+#if defined(_VECTOR_)
  // workaround MSVC std::vector implementation
-  void resize(size_type __new_size, const value_type& __x)                 
-  {                                                              
-    if (vector_base::size() < __new_size)                                 
-      vector_base::_Insert_n(vector_base::end(), __new_size - vector_base::size(), __x);
-    else if (__new_size < vector_base::size())
-      vector_base::erase(vector_base::begin() + __new_size, vector_base::end());
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (vector_base::size() < new_size)
+      vector_base::_Insert_n(vector_base::end(), new_size - vector_base::size(), x);
+    else if (new_size < vector_base::size())
+      vector_base::erase(vector_base::begin() + new_size, vector_base::end());
  }
-  #elif defined(_GLIBCXX_VECTOR) && EIGEN_GNUC_AT_LEAST(4,1)
+  void push_back(const value_type& x)
+  { vector_base::push_back(x); } 
+  using vector_base::insert;  
+  iterator insert(const_iterator position, const value_type& x)
+  { return vector_base::insert(position,x); }
+  void insert(const_iterator position, size_type new_size, const value_type& x)
+  { vector_base::insert(position, new_size, x); }
+#elif defined(_GLIBCXX_VECTOR) && EIGEN_GNUC_AT_LEAST(4,1)
  // workaround GCC std::vector implementation
  // Note that before gcc-4.1 we already have: std::vector::resize(size_type,const T&),
  // no no need to workaround !
-  void resize(size_type __new_size, const value_type& __x)
-  {                                              
-    if (__new_size < vector_base::size())               
-      vector_base::_M_erase_at_end(this->_M_impl._M_start + __new_size);
-    else                       
-      vector_base::insert(vector_base::end(), __new_size - vector_base::size(), __x); 
-  }                                                              
-  #else
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (new_size < vector_base::size())
+      vector_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
+    else
+      vector_base::insert(vector_base::end(), new_size - vector_base::size(), x);
+  }
+#elif defined(_GLIBCXX_VECTOR)
  using vector_base::resize;
-  #endif  
+#else
+  // default implementation which should always work.
+  void resize(size_type new_size, const value_type& x)
+  {
+    if (new_size < vector_base::size())
+      vector_base::erase(vector_base::begin() + new_size, vector_base::end());
+    else if (new_size > vector_base::size())
+      vector_base::insert(vector_base::end(), new_size - vector_base::size(), x);
+  }
+#endif
+
 };

 }
--- a/Eigen/src/Array/CMakeLists.txt
+++ b/Eigen/src/Array/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_Array_SRCS "*.h")

 INSTALL(FILES
  ${Eigen_Array_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Array
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Array COMPONENT Devel
  )
--- a/Eigen/src/Array/CwiseOperators.h
+++ b/Eigen/src/Array/CwiseOperators.h
@@ -82,7 +82,7 @@ Cwise<ExpressionType>::log() const
  * Example: \include Cwise_cos.cpp
  * Output: \verbinclude Cwise_cos.out
  *
-  * \sa sin(), exp()
+  * \sa sin(), exp(), EIGEN_FAST_MATH
  */
 template<typename ExpressionType>
 inline const EIGEN_CWISE_UNOP_RETURN_TYPE(ei_scalar_cos_op)
@@ -99,7 +99,7 @@ Cwise<ExpressionType>::cos() const
  * Example: \include Cwise_sin.cpp
  * Output: \verbinclude Cwise_sin.out
  *
-  * \sa cos(), exp()
+  * \sa cos(), exp(), EIGEN_FAST_MATH
  */
 template<typename ExpressionType>
 inline const EIGEN_CWISE_UNOP_RETURN_TYPE(ei_scalar_sin_op)
--- a/Eigen/src/Array/Functors.h
+++ b/Eigen/src/Array/Functors.h
@@ -58,10 +58,16 @@ struct ei_functor_traits<ei_scalar_add_op<Scalar> >
  */
 template<typename Scalar> struct ei_scalar_sqrt_op EIGEN_EMPTY_STRUCT {
  inline const Scalar operator() (const Scalar& a) const { return ei_sqrt(a); }
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return ei_psqrt(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_sqrt_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{ enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = ei_packet_traits<Scalar>::HasSqrt
+  };
+};

 /** \internal
  *
@@ -73,10 +79,12 @@ struct ei_functor_traits<ei_scalar_sqrt_op<Scalar> >
  */
 template<typename Scalar> struct ei_scalar_exp_op EIGEN_EMPTY_STRUCT {
  inline const Scalar operator() (const Scalar& a) const { return ei_exp(a); }
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return ei_pexp(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_exp_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasExp }; };

 /** \internal
  *
@@ -88,10 +96,12 @@ struct ei_functor_traits<ei_scalar_exp_op<Scalar> >
  */
 template<typename Scalar> struct ei_scalar_log_op EIGEN_EMPTY_STRUCT {
  inline const Scalar operator() (const Scalar& a) const { return ei_log(a); }
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return ei_plog(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_log_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasLog }; };

 /** \internal
  *
@@ -102,11 +112,18 @@ struct ei_functor_traits<ei_scalar_log_op<Scalar> >
  * \sa class CwiseUnaryOp, Cwise::cos()
  */
 template<typename Scalar> struct ei_scalar_cos_op EIGEN_EMPTY_STRUCT {
-  inline const Scalar operator() (const Scalar& a) const { return ei_cos(a); }
+  inline Scalar operator() (const Scalar& a) const { return ei_cos(a); }
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return ei_pcos(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_cos_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = ei_packet_traits<Scalar>::HasCos && EIGEN_FAST_MATH
+  };
+};

 /** \internal
  *
@@ -118,10 +135,17 @@ struct ei_functor_traits<ei_scalar_cos_op<Scalar> >
  */
 template<typename Scalar> struct ei_scalar_sin_op EIGEN_EMPTY_STRUCT {
  inline const Scalar operator() (const Scalar& a) const { return ei_sin(a); }
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return ei_psin(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_sin_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = ei_packet_traits<Scalar>::HasSin && EIGEN_FAST_MATH
+  };
+};

 /** \internal
  *
--- a/Eigen/src/Array/PartialRedux.h
+++ b/Eigen/src/Array/PartialRedux.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // Eigen is free software; you can redistribute it and/or
@@ -26,7 +26,7 @@
 #ifndef EIGEN_PARTIAL_REDUX_H
 #define EIGEN_PARTIAL_REDUX_H

-/** \array_module \ingroup Array
+/** \array_module \ingroup Array_Module
  *
  * \class PartialReduxExpr
  *
@@ -119,6 +119,8 @@ EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
+

 /** \internal */
 template <typename BinaryOp, typename Scalar>
@@ -135,7 +137,7 @@ struct ei_member_redux {
  const BinaryOp m_functor;
 };

-/** \array_module \ingroup Array
+/** \array_module \ingroup Array_Module
  *
  * \class PartialRedux
  *
@@ -177,8 +179,6 @@ template<typename ExpressionType, int Direction> class PartialRedux
                              > Type;
    };

-    typedef typename ExpressionType::PlainMatrixType CrossReturnType;
-    
    inline PartialRedux(const ExpressionType& matrix) : m_matrix(matrix) {}

    /** \internal */
@@ -251,7 +251,7 @@ template<typename ExpressionType, int Direction> class PartialRedux
      * \sa MatrixBase::any() */
    const typename ReturnType<ei_member_any>::Type any() const
    { return _expression(); }
-    
+
    /** \returns a row (or column) vector expression representing
      * the number of \c true coefficients of each respective column (or row).
      *
@@ -262,32 +262,68 @@ template<typename ExpressionType, int Direction> class PartialRedux
    const PartialReduxExpr<ExpressionType, ei_member_count<int>, Direction> count() const
    { return _expression(); }

-    /** \returns a 3x3 matrix expression of the cross product
-      * of each column or row of the referenced expression with the \a other vector.
+    /** \returns a row (or column) vector expression of the product
+      * of each column (or row) of the referenced expression.
      *
-      * \geometry_module
+      * Example: \include PartialRedux_prod.cpp
+      * Output: \verbinclude PartialRedux_prod.out
      *
-      * \sa MatrixBase::cross() */
-    template<typename OtherDerived>
-    const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const
-    {
-      EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(CrossReturnType,3,3)
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
-      EIGEN_STATIC_ASSERT((ei_is_same_type<Scalar, typename OtherDerived::Scalar>::ret),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+      * \sa MatrixBase::prod() */
+    const typename ReturnType<ei_member_prod>::Type prod() const
+    { return _expression(); }

-      if(Direction==Vertical)
-        return (CrossReturnType()
-                                 << _expression().col(0).cross(other),
-                                    _expression().col(1).cross(other),
-                                    _expression().col(2).cross(other)).finished();
-      else
-        return (CrossReturnType() 
-                                 << _expression().row(0).cross(other),
-                                    _expression().row(1).cross(other),
-                                    _expression().row(2).cross(other)).finished();
+
+    /** \returns a matrix expression
+      * where each column (or row) are reversed.
+      *
+      * Example: \include PartialRedux_reverse.cpp
+      * Output: \verbinclude PartialRedux_reverse.out
+      *
+      * \sa MatrixBase::reverse() */
+    const Reverse<ExpressionType, Direction> reverse() const
+    {
+      return Reverse<ExpressionType, Direction>( _expression() );
    }

+    const Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1>
+    replicate(int factor) const;
+
+    template<int Factor>
+    const Replicate<ExpressionType,(Direction==Vertical?Factor:1),(Direction==Horizontal?Factor:1)>
+    replicate(int factor = Factor) const;
+
+/////////// Geometry module ///////////
+
+    const Homogeneous<ExpressionType,Direction> homogeneous() const;
+
+    typedef typename ExpressionType::PlainMatrixType CrossReturnType;
+    template<typename OtherDerived>
+    const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
+
+    enum {
+      HNormalized_Size = Direction==Vertical ? ei_traits<ExpressionType>::RowsAtCompileTime
+                                             : ei_traits<ExpressionType>::ColsAtCompileTime,
+      HNormalized_SizeMinusOne = HNormalized_Size==Dynamic ? Dynamic : HNormalized_Size-1
+    };
+    typedef Block<ExpressionType,
+                  Direction==Vertical   ? int(HNormalized_SizeMinusOne)
+                                        : int(ei_traits<ExpressionType>::RowsAtCompileTime),
+                  Direction==Horizontal ? int(HNormalized_SizeMinusOne)
+                                        : int(ei_traits<ExpressionType>::ColsAtCompileTime)>
+            HNormalized_Block;
+    typedef Block<ExpressionType,
+                  Direction==Vertical   ? 1 : int(ei_traits<ExpressionType>::RowsAtCompileTime),
+                  Direction==Horizontal ? 1 : int(ei_traits<ExpressionType>::ColsAtCompileTime)>
+            HNormalized_Factors;
+    typedef CwiseBinaryOp<ei_scalar_quotient_op<typename ei_traits<ExpressionType>::Scalar>,
+                NestByValue<HNormalized_Block>,
+                NestByValue<Replicate<NestByValue<HNormalized_Factors>,
+                  Direction==Vertical   ? HNormalized_SizeMinusOne : 1,
+                  Direction==Horizontal ? HNormalized_SizeMinusOne : 1> > >
+            HNormalizedReturnType;
+
+    const HNormalizedReturnType hnormalized() const;
+
  protected:
    ExpressionTypeNested m_matrix;
 };
--- a/Eigen/src/Array/Replicate.h
+++ b/Eigen/src/Array/Replicate.h
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_REPLICATE_H
+#define EIGEN_REPLICATE_H
+
+/** \nonstableyet
+  * \class Replicate
+  *
+  * \brief Expression of the multiple replication of a matrix or vector
+  *
+  * \param MatrixType the type of the object we are replicating
+  *
+  * This class represents an expression of the multiple replication of a matrix or vector.
+  * It is the return type of MatrixBase::replicate() and most of the time
+  * this is the only way it is used.
+  *
+  * \sa MatrixBase::replicate()
+  */
+template<typename MatrixType,int RowFactor,int ColFactor>
+struct ei_traits<Replicate<MatrixType,RowFactor,ColFactor> >
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename ei_nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ei_unref<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ?
+                  int(MatrixType::RowsAtCompileTime) + 1 : Dynamic,
+    ColsPlusOne = (MatrixType::ColsAtCompileTime != Dynamic) ?
+                  int(MatrixType::ColsAtCompileTime) + 1 : Dynamic,
+    RowsAtCompileTime = RowFactor==Dynamic || MatrixType::RowsAtCompileTime==Dynamic
+                      ? Dynamic
+                      : RowFactor * MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = ColFactor==Dynamic || MatrixType::ColsAtCompileTime==Dynamic
+                      ? Dynamic
+                      : ColFactor * MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    Flags = _MatrixTypeNested::Flags & HereditaryBits,
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+  };
+};
+
+template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
+  : public MatrixBase<Replicate<MatrixType,RowFactor,ColFactor> >
+{
+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Replicate)
+
+    inline Replicate(const MatrixType& matrix)
+      : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
+    {
+      ei_assert(RowFactor!=Dynamic && ColFactor!=Dynamic);
+    }
+
+    inline Replicate(const MatrixType& matrix, int rowFactor, int colFactor)
+      : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
+    {}
+
+    inline int rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+    inline int cols() const { return m_matrix.cols() * m_colFactor.value(); }
+
+    inline Scalar coeff(int row, int col) const
+    {
+      return m_matrix.coeff(row%m_matrix.rows(), col%m_matrix.cols());
+    }
+
+  protected:
+    const typename MatrixType::Nested m_matrix;
+    const ei_int_if_dynamic<RowFactor> m_rowFactor;
+    const ei_int_if_dynamic<ColFactor> m_colFactor;
+};
+
+/** \nonstableyet
+  * \return an expression of the replication of \c *this
+  *
+  * Example: \include MatrixBase_replicate.cpp
+  * Output: \verbinclude MatrixBase_replicate.out
+  *
+  * \sa PartialRedux::replicate(), MatrixBase::replicate(int,int), class Replicate
+  */
+template<typename Derived>
+template<int RowFactor, int ColFactor>
+inline const Replicate<Derived,RowFactor,ColFactor>
+MatrixBase<Derived>::replicate() const
+{
+  return derived();
+}
+
+/** \nonstableyet
+  * \return an expression of the replication of \c *this
+  *
+  * Example: \include MatrixBase_replicate_int_int.cpp
+  * Output: \verbinclude MatrixBase_replicate_int_int.out
+  *
+  * \sa PartialRedux::replicate(), MatrixBase::replicate<int,int>(), class Replicate
+  */
+template<typename Derived>
+inline const Replicate<Derived,Dynamic,Dynamic>
+MatrixBase<Derived>::replicate(int rowFactor,int colFactor) const
+{
+  return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);
+}
+
+/** \nonstableyet
+  * \return an expression of the replication of each column (or row) of \c *this
+  *
+  * Example: \include DirectionWise_replicate_int.cpp
+  * Output: \verbinclude DirectionWise_replicate_int.out
+  *
+  * \sa PartialRedux::replicate(), MatrixBase::replicate(), class Replicate
+  */
+template<typename ExpressionType, int Direction>
+const Replicate<ExpressionType,(Direction==Vertical?Dynamic:1),(Direction==Horizontal?Dynamic:1)>
+PartialRedux<ExpressionType,Direction>::replicate(int factor) const
+{
+  return Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1>
+          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+}
+
+/** \nonstableyet
+  * \return an expression of the replication of each column (or row) of \c *this
+  *
+  * Example: \include DirectionWise_replicate.cpp
+  * Output: \verbinclude DirectionWise_replicate.out
+  *
+  * \sa PartialRedux::replicate(int), MatrixBase::replicate(), class Replicate
+  */
+template<typename ExpressionType, int Direction>
+template<int Factor>
+const Replicate<ExpressionType,(Direction==Vertical?Factor:1),(Direction==Horizontal?Factor:1)>
+PartialRedux<ExpressionType,Direction>::replicate(int factor) const
+{
+  return Replicate<ExpressionType,Direction==Vertical?Factor:1,Direction==Horizontal?Factor:1>
+          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+}
+
+#endif // EIGEN_REPLICATE_H
--- a/Eigen/src/Array/Reverse.h
+++ b/Eigen/src/Array/Reverse.h
@@ -0,0 +1,202 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2009 Ricard Marxer <email@ricardmarxer.com>
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_REVERSE_H
+#define EIGEN_REVERSE_H
+
+/** \array_module \ingroup Array_Module
+  *
+  * \class Reverse
+  *
+  * \brief Expression of the reverse of a vector or matrix
+  *
+  * \param MatrixType the type of the object of which we are taking the reverse
+  *
+  * This class represents an expression of the reverse of a vector.
+  * It is the return type of MatrixBase::reverse() and PartialRedux::reverse()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::reverse(), PartialRedux::reverse()
+  */
+template<typename MatrixType, int Direction>
+struct ei_traits<Reverse<MatrixType, Direction> >
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename ei_nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ei_unref<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
+                 ? LinearAccessBit : 0,
+
+    Flags = (int(_MatrixTypeNested::Flags) & (HereditaryBits | PacketAccessBit | LinearAccess))
+          | (int(_MatrixTypeNested::Flags)&UpperTriangularBit ? LowerTriangularBit : 0)
+          | (int(_MatrixTypeNested::Flags)&LowerTriangularBit ? UpperTriangularBit : 0),
+
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+  };
+};
+
+template<typename PacketScalar, bool ReversePacket> struct ei_reverse_packet_cond
+{
+  static inline PacketScalar run(const PacketScalar& x) { return ei_preverse(x); }
+};
+template<typename PacketScalar> struct ei_reverse_packet_cond<PacketScalar,false>
+{
+  static inline PacketScalar run(const PacketScalar& x) { return x; }
+};
+
+template<typename MatrixType, int Direction> class Reverse
+  : public MatrixBase<Reverse<MatrixType, Direction> >
+{
+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Reverse)
+
+  protected:
+    enum {
+      PacketSize = ei_packet_traits<Scalar>::size,
+      IsRowMajor = Flags & RowMajorBit,
+      IsColMajor = !IsRowMajor,
+      ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
+      ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1,
+      ReversePacket = (Direction == BothDirections)
+                    || ((Direction == Vertical)   && IsColMajor)
+                    || ((Direction == Horizontal) && IsRowMajor)
+    };
+    typedef ei_reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
+  public:
+
+    inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
+
+    inline int rows() const { return m_matrix.rows(); }
+    inline int cols() const { return m_matrix.cols(); }
+
+    inline Scalar& coeffRef(int row, int col)
+    {
+      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
+                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
+    }
+
+    inline const Scalar coeff(int row, int col) const
+    {
+      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
+                            ReverseCol ? m_matrix.cols() - col - 1 : col);
+    }
+
+    inline const Scalar coeff(int index) const
+    {
+      return m_matrix.coeff(m_matrix.size() - index - 1);
+    }
+
+    inline Scalar& coeffRef(int index)
+    {
+      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(int row, int col) const
+    {
+      return reverse_packet::run(m_matrix.template packet<LoadMode>(
+                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
+                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col));
+    }
+
+    template<int LoadMode>
+    inline void writePacket(int row, int col, const PacketScalar& x)
+    {
+      m_matrix.const_cast_derived().template writePacket<LoadMode>(
+                                      ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
+                                      ReverseCol ? m_matrix.cols() - col - OffsetCol : col,
+                                      reverse_packet::run(x));
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(int index) const
+    {
+      return ei_preverse(m_matrix.template packet<LoadMode>( m_matrix.size() - index - PacketSize ));
+    }
+
+    template<int LoadMode>
+    inline void writePacket(int index, const PacketScalar& x)
+    {
+      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, ei_preverse(x));
+    }
+
+  protected:
+    const typename MatrixType::Nested m_matrix;
+};
+
+/** \returns an expression of the reverse of *this.
+  *
+  * Example: \include MatrixBase_reverse.cpp
+  * Output: \verbinclude MatrixBase_reverse.out
+  *
+  */
+template<typename Derived>
+inline Reverse<Derived, BothDirections>
+MatrixBase<Derived>::reverse()
+{
+  return derived();
+}
+
+/** This is the const version of reverse(). */
+template<typename Derived>
+inline const Reverse<Derived, BothDirections>
+MatrixBase<Derived>::reverse() const
+{
+  return derived();
+}
+
+/** This is the "in place" version of reverse: it reverses \c *this.
+  *
+  * In most cases it is probably better to simply use the reversed expression
+  * of a matrix. However, when reversing the matrix data itself is really needed,
+  * then this "in-place" version is probably the right choice because it provides
+  * the following additional features:
+  *  - less error prone: doing the same operation with .reverse() requires special care:
+  *    \code m = m.reverse().eval(); \endcode
+  *  - no temporary object is created (currently there is one created but could be avoided using swap)
+  *  - it allows future optimizations (cache friendliness, etc.)
+  *
+  * \sa reverse() */
+template<typename Derived>
+inline void MatrixBase<Derived>::reverseInPlace()
+{
+  derived() = derived().reverse().eval();
+}
+
+
+#endif // EIGEN_REVERSE_H
--- a/Eigen/src/Array/Select.h
+++ b/Eigen/src/Array/Select.h
@@ -25,7 +25,7 @@
 #ifndef EIGEN_SELECT_H
 #define EIGEN_SELECT_H

-/** \array_module \ingroup Array
+/** \array_module \ingroup Array_Module
  *
  * \class Select
  *
--- a/Eigen/src/Cholesky/CMakeLists.txt
+++ b/Eigen/src/Cholesky/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_Cholesky_SRCS "*.h")

 INSTALL(FILES
  ${Eigen_Cholesky_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Cholesky
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Cholesky COMPONENT Devel
  )
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -2,6 +2,8 @@
 // for linear algebra. Eigen itself is part of the KDE project.
 //
 // Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2009 Keir Mierle <mierle@gmail.com>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -29,22 +31,27 @@
  *
  * \class LDLT
  *
-  * \brief Robust Cholesky decomposition of a matrix and associated features
+  * \brief Robust Cholesky decomposition of a matrix
  *
-  * \param MatrixType the type of the matrix of which we are computing the LDL^T Cholesky decomposition
+  * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
  *
-  * This class performs a Cholesky decomposition without square root of a symmetric, positive definite
-  * matrix A such that A = L D L^* = U^* D U, where L is lower triangular with a unit diagonal
-  * and D is a diagonal matrix.
+  * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite
+  * matrix \f$ A \f$ such that \f$ A =  P^TLDL^*P \f$, where P is a permutation matrix, L
+  * is lower triangular with a unit diagonal and D is a diagonal matrix.
  *
-  * Compared to a standard Cholesky decomposition, avoiding the square roots allows for faster and more
-  * stable computation.
+  * The decomposition uses pivoting to ensure stability, so that L will have
+  * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root
+  * on D also stabilizes the computation.
  *
-  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
-  * the strict lower part does not have to store correct values.
+  * Remember that Cholesky decompositions are not rank-revealing.  Also, do not use a Cholesky decomposition to determine
+  * whether a system of equations has a solution.
  *
  * \sa MatrixBase::ldlt(), class LLT
  */
+ /* THIS PART OF THE DOX IS CURRENTLY DISABLED BECAUSE INACCURATE BECAUSE OF BUG IN THE DECOMPOSITION CODE
+  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
+  * the strict lower part does not have to store correct values.
+  */
 template<typename MatrixType> class LDLT
 {
  public:
@@ -52,9 +59,13 @@ template<typename MatrixType> class LDLT
    typedef typename MatrixType::Scalar Scalar;
    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
+    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef Matrix<int, 1, MatrixType::RowsAtCompileTime> IntRowVectorType;

    LDLT(const MatrixType& matrix)
-      : m_matrix(matrix.rows(), matrix.cols())
+      : m_matrix(matrix.rows(), matrix.cols()),
+        m_p(matrix.rows()),
+        m_transpositions(matrix.rows())
    {
      compute(matrix);
    }
@@ -62,11 +73,23 @@ template<typename MatrixType> class LDLT
    /** \returns the lower triangular matrix L */
    inline Part<MatrixType, UnitLowerTriangular> matrixL(void) const { return m_matrix; }

-    /** \returns the coefficients of the diagonal matrix D */
-    inline DiagonalCoeffs<MatrixType> vectorD(void) const { return m_matrix.diagonal(); }
+    /** \returns a vector of integers, whose size is the number of rows of the matrix being decomposed,
+      * representing the P permutation i.e. the permutation of the rows. For its precise meaning,
+      * see the examples given in the documentation of class LU.
+      */
+    inline const IntColVectorType& permutationP() const
+    {
+      return m_p;
+    }

-    /** \returns true if the matrix is positive definite */
-    inline bool isPositiveDefinite(void) const { return m_isPositiveDefinite; }
+    /** \returns the coefficients of the diagonal matrix D */
+    inline Diagonal<MatrixType,0> vectorD(void) const { return m_matrix.diagonal(); }
+
+    /** \returns true if the matrix is positive (semidefinite) */
+    inline bool isPositive(void) const { return m_sign == 1; }
+
+    /** \returns true if the matrix is negative (semidefinite) */
+    inline bool isNegative(void) const { return m_sign == -1; }

    template<typename RhsDerived, typename ResDerived>
    bool solve(const MatrixBase<RhsDerived> &b, MatrixBase<ResDerived> *result) const;
@@ -78,76 +101,119 @@ template<typename MatrixType> class LDLT

  protected:
    /** \internal
-      * Used to compute and store the cholesky decomposition A = L D L^* = U^* D U.
+      * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
      * The strict upper part is used during the decomposition, the strict lower
      * part correspond to the coefficients of L (its diagonal is equal to 1 and
      * is not stored), and the diagonal entries correspond to D.
      */
    MatrixType m_matrix;
-
-    bool m_isPositiveDefinite;
+    IntColVectorType m_p;
+    IntColVectorType m_transpositions;
+    int m_sign;
 };

-/** Compute / recompute the LLT decomposition A = L D L^* = U^* D U of \a matrix
+/** Compute / recompute the LDLT decomposition A = L D L^* = U^* D U of \a matrix
  */
 template<typename MatrixType>
 void LDLT<MatrixType>::compute(const MatrixType& a)
 {
-  assert(a.rows()==a.cols());
+  ei_assert(a.rows()==a.cols());
  const int size = a.rows();
-  m_matrix.resize(size, size);
-  m_isPositiveDefinite = true;
-  const RealScalar eps = ei_sqrt(precision<Scalar>());

-  if (size<=1)
-  {
-    m_matrix = a;
+  m_matrix = a;
+
+  if (size <= 1) {
+    m_p.setZero();
+    m_transpositions.setZero();
+    m_sign = ei_real(a.coeff(0,0))>0 ? 1:-1;
    return;
  }

-  // Let's preallocate a temporay vector to evaluate the matrix-vector product into it.
-  // Unlike the standard LLT decomposition, here we cannot evaluate it to the destination
-  // matrix because it a sub-row which is not compatible suitable for efficient packet evaluation.
-  // (at least if we assume the matrix is col-major)
+  RealScalar cutoff = 0, biggest_in_corner;
+
+  // By using a temorary, packet-aligned products are guarenteed. In the LLT
+  // case this is unnecessary because the diagonal is included and will always
+  // have optimal alignment.
  Matrix<Scalar,MatrixType::RowsAtCompileTime,1> _temporary(size);

-  // Note that, in this algorithm the rows of the strict upper part of m_matrix is used to store
-  // column vector, thus the strange .conjugate() and .transpose()...
-
-  m_matrix.row(0) = a.row(0).conjugate();
-  m_matrix.col(0).end(size-1) = m_matrix.row(0).end(size-1) / m_matrix.coeff(0,0);
-  for (int j = 1; j < size; ++j)
+  for (int j = 0; j < size; ++j)
  {
-    RealScalar tmp = ei_real(a.coeff(j,j) - (m_matrix.row(j).start(j) * m_matrix.col(j).start(j).conjugate()).coeff(0,0));
-    m_matrix.coeffRef(j,j) = tmp;
+    // Find largest diagonal element
+    int index_of_biggest_in_corner;
+    biggest_in_corner = m_matrix.diagonal().end(size-j).cwise().abs()
+                       .maxCoeff(&index_of_biggest_in_corner);
+    index_of_biggest_in_corner += j;

-    if (tmp < eps)
+    if(j == 0)
    {
-      m_isPositiveDefinite = false;
-      return;
+      // The biggest overall is the point of reference to which further diagonals
+      // are compared; if any diagonal is negligible compared
+      // to the largest overall, the algorithm bails.  This cutoff is suggested
+      // in "Analysis of the Cholesky Decomposition of a Semi-definite Matrix" by
+      // Nicholas J. Higham. Also see "Accuracy and Stability of Numerical
+      // Algorithms" page 217, also by Higham.
+      cutoff = ei_abs(machine_epsilon<Scalar>() * size * biggest_in_corner);
+
+      m_sign = ei_real(m_matrix.diagonal().coeff(index_of_biggest_in_corner)) > 0 ? 1 : -1;
    }

-    int endSize = size-j-1;
-    if (endSize>0)
+    // Finish early if the matrix is not full rank.
+    if(biggest_in_corner < cutoff)
    {
-      _temporary.end(endSize) = ( m_matrix.block(j+1,0, endSize, j)
-                                  * m_matrix.col(j).start(j).conjugate() ).lazy();
+      for(int i = j; i < size; i++) m_transpositions.coeffRef(i) = i;
+      break;
+    }

-      m_matrix.row(j).end(endSize) = a.row(j).end(endSize).conjugate()
+    m_transpositions.coeffRef(j) = index_of_biggest_in_corner;
+    if(j != index_of_biggest_in_corner)
+    {
+      m_matrix.row(j).swap(m_matrix.row(index_of_biggest_in_corner));
+      m_matrix.col(j).swap(m_matrix.col(index_of_biggest_in_corner));
+    }
+
+    if (j == 0) {
+      m_matrix.row(0) = m_matrix.row(0).conjugate();
+      m_matrix.col(0).end(size-1) = m_matrix.row(0).end(size-1) / m_matrix.coeff(0,0);
+      continue;
+    }
+
+    RealScalar Djj = ei_real(m_matrix.coeff(j,j) - (m_matrix.row(j).start(j)
+                                                  * m_matrix.col(j).start(j).conjugate()).coeff(0,0));
+    m_matrix.coeffRef(j,j) = Djj;
+
+    // Finish early if the matrix is not full rank.
+    if(ei_abs(Djj) < cutoff)
+    {
+      for(int i = j; i < size; i++) m_transpositions.coeffRef(i) = i;
+      break;
+    }
+
+    int endSize = size - j - 1;
+    if (endSize > 0) {
+      _temporary.end(endSize) = ( m_matrix.block(j+1,0, endSize, j)
+                                * m_matrix.col(j).start(j).conjugate() ).lazy();
+
+      m_matrix.row(j).end(endSize) = m_matrix.row(j).end(endSize).conjugate()
                                   - _temporary.end(endSize).transpose();

-      m_matrix.col(j).end(endSize) = m_matrix.row(j).end(endSize) / tmp;
+      m_matrix.col(j).end(endSize) = m_matrix.row(j).end(endSize) / Djj;
    }
  }
+
+  // Reverse applied swaps to get P matrix.
+  for(int k = 0; k < size; ++k) m_p.coeffRef(k) = k;
+  for(int k = size-1; k >= 0; --k) {
+    std::swap(m_p.coeffRef(k), m_p.coeffRef(m_transpositions.coeff(k)));
+  }
 }

 /** Computes the solution x of \f$ A x = b \f$ using the current decomposition of A.
  * The result is stored in \a result
  *
-  * \returns true in case of success, false otherwise.
+  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
  *
  * In other words, it computes \f$ b = A^{-1} b \f$ with
-  * \f$ {L^{*}}^{-1} D^{-1} L^{-1} b \f$ from right to left.
+  * \f$ P^T{L^{*}}^{-1} D^{-1} L^{-1} P b \f$ from right to left.
  *
  * \sa LDLT::solveInPlace(), MatrixBase::ldlt()
  */
@@ -157,7 +223,7 @@ bool LDLT<MatrixType>
 ::solve(const MatrixBase<RhsDerived> &b, MatrixBase<ResDerived> *result) const
 {
  const int size = m_matrix.rows();
-  ei_assert(size==b.rows() && "LLT::solve(): invalid number of rows of the right hand side matrix b");
+  ei_assert(size==b.rows() && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
  *result = b;
  return solveInPlace(*result);
 }
@@ -166,6 +232,8 @@ bool LDLT<MatrixType>
  *
  * \param bAndX represents both the right-hand side matrix b and result x.
  *
+  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
+  *
  * This version avoids a copy when the right hand side matrix b is not
  * needed anymore.
  *
@@ -176,17 +244,28 @@ template<typename Derived>
 bool LDLT<MatrixType>::solveInPlace(MatrixBase<Derived> &bAndX) const
 {
  const int size = m_matrix.rows();
-  ei_assert(size==bAndX.rows());
-  if (!m_isPositiveDefinite)
-    return false;
+  ei_assert(size == bAndX.rows());
+
+  // z = P b
+  for(int i = 0; i < size; ++i) bAndX.row(m_transpositions.coeff(i)).swap(bAndX.row(i));
+
+  // y = L^-1 z
  matrixL().solveTriangularInPlace(bAndX);
-  bAndX = (m_matrix.cwise().inverse().template part<Diagonal>() * bAndX).lazy();
+
+  // w = D^-1 y
+  bAndX = (m_matrix.diagonal().cwise().inverse().asDiagonal() * bAndX).lazy();
+
+  // u = L^-T w
  m_matrix.adjoint().template part<UnitUpperTriangular>().solveTriangularInPlace(bAndX);
+
+  // x = P^T u
+  for (int i = size-1; i >= 0; --i) bAndX.row(m_transpositions.coeff(i)).swap(bAndX.row(i));
+
  return true;
 }

 /** \cholesky_module
-  * \returns the Cholesky decomposition without square root of \c *this
+  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
  */
 template<typename Derived>
 inline const LDLT<typename MatrixBase<Derived>::PlainMatrixType>
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -41,11 +41,16 @@
  * and even faster. Nevertheless, this standard Cholesky decomposition remains useful in many other
  * situations like generalised eigen problems with hermitian matrices.
  *
-  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
-  * the strict lower part does not have to store correct values.
+  * Remember that Cholesky decompositions are not rank-revealing. This LLT decomposition is only stable on positive definite matrices,
+  * use LDLT instead for the semidefinite case. Also, do not use a Cholesky decomposition to determine whether a system of equations
+  * has a solution.
  *
  * \sa MatrixBase::llt(), class LDLT
  */
+ /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
+  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
+  * the strict lower part does not have to store correct values.
+  */
 template<typename MatrixType> class LLT
 {
  private:
@@ -69,9 +74,6 @@ template<typename MatrixType> class LLT
    /** \returns the lower triangular matrix L */
    inline Part<MatrixType, LowerTriangular> matrixL(void) const { return m_matrix; }

-    /** \returns true if the matrix is positive definite */
-    inline bool isPositiveDefinite(void) const { return m_isPositiveDefinite; }
-
    template<typename RhsDerived, typename ResDerived>
    bool solve(const MatrixBase<RhsDerived> &b, MatrixBase<ResDerived> *result) const;

@@ -86,7 +88,6 @@ template<typename MatrixType> class LLT
      * The strict upper part is not used and even not initialized.
      */
    MatrixType m_matrix;
-    bool m_isPositiveDefinite;
 };

 /** Computes / recomputes the Cholesky decomposition A = LL^* = U^*U of \a matrix
@@ -97,22 +98,24 @@ void LLT<MatrixType>::compute(const MatrixType& a)
  assert(a.rows()==a.cols());
  const int size = a.rows();
  m_matrix.resize(size, size);
-  const RealScalar eps = ei_sqrt(precision<Scalar>());
-
+  // The biggest overall is the point of reference to which further diagonals
+  // are compared; if any diagonal is negligible compared
+  // to the largest overall, the algorithm bails.  This cutoff is suggested
+  // in "Analysis of the Cholesky Decomposition of a Semi-definite Matrix" by
+  // Nicholas J. Higham. Also see "Accuracy and Stability of Numerical
+  // Algorithms" page 217, also by Higham.
+  const RealScalar cutoff = machine_epsilon<Scalar>() * size * a.diagonal().cwise().abs().maxCoeff();
  RealScalar x;
  x = ei_real(a.coeff(0,0));
-  m_isPositiveDefinite = x > eps && ei_isMuchSmallerThan(ei_imag(a.coeff(0,0)), RealScalar(1));
  m_matrix.coeffRef(0,0) = ei_sqrt(x);
+  if(size==1)
+    return;
  m_matrix.col(0).end(size-1) = a.row(0).end(size-1).adjoint() / ei_real(m_matrix.coeff(0,0));
  for (int j = 1; j < size; ++j)
  {
-    Scalar tmp = ei_real(a.coeff(j,j)) - m_matrix.row(j).start(j).squaredNorm();
-    x = ei_real(tmp);
-    if (x < eps || (!ei_isMuchSmallerThan(ei_imag(tmp), RealScalar(1))))
-    {
-      m_isPositiveDefinite = false;
-      return;
-    }
+    x = ei_real(a.coeff(j,j)) - m_matrix.row(j).start(j).squaredNorm();
+    if (ei_abs(x) < cutoff) continue;
+
    m_matrix.coeffRef(j,j) = x = ei_sqrt(x);

    int endSize = size-j-1;
@@ -132,7 +135,7 @@ void LLT<MatrixType>::compute(const MatrixType& a)
 /** Computes the solution x of \f$ A x = b \f$ using the current decomposition of A.
  * The result is stored in \a result
  *
-  * \returns true in case of success, false otherwise.
+  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
  *
  * In other words, it computes \f$ b = A^{-1} b \f$ with
  * \f$ {L^{*}}^{-1} L^{-1} b \f$ from right to left.
@@ -155,6 +158,8 @@ bool LLT<MatrixType>::solve(const MatrixBase<RhsDerived> &b, MatrixBase<ResDeriv
  *
  * \param bAndX represents both the right-hand side matrix b and result x.
  *
+  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
+  *
  * This version avoids a copy when the right hand side matrix b is not
  * needed anymore.
  *
@@ -166,8 +171,6 @@ bool LLT<MatrixType>::solveInPlace(MatrixBase<Derived> &bAndX) const
 {
  const int size = m_matrix.rows();
  ei_assert(size==bAndX.rows());
-  if (!m_isPositiveDefinite)
-    return false;
  matrixL().solveTriangularInPlace(bAndX);
  m_matrix.adjoint().template part<UpperTriangular>().solveTriangularInPlace(bAndX);
  return true;
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -61,7 +61,6 @@
  *
  * \sa MatrixBase::block(int,int,int,int), MatrixBase::block(int,int), class VectorBlock
  */
-
 template<typename MatrixType, int BlockRows, int BlockCols, int _PacketAccess, int _DirectAccessStatus>
 struct ei_traits<Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectAccessStatus> >
 {
@@ -69,8 +68,8 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectA
  typedef typename ei_nested<MatrixType>::type MatrixTypeNested;
  typedef typename ei_unref<MatrixTypeNested>::type _MatrixTypeNested;
  enum{
-    RowsAtCompileTime = ei_traits<MatrixType>::RowsAtCompileTime == 1 ? 1 : BlockRows,
-    ColsAtCompileTime = ei_traits<MatrixType>::ColsAtCompileTime == 1 ? 1 : BlockCols,
+    RowsAtCompileTime = BlockRows,
+    ColsAtCompileTime = BlockCols,
    MaxRowsAtCompileTime = RowsAtCompileTime == 1 ? 1
      : (BlockRows==Dynamic ? int(ei_traits<MatrixType>::MaxRowsAtCompileTime) : BlockRows),
    MaxColsAtCompileTime = ColsAtCompileTime == 1 ? 1
@@ -153,7 +152,7 @@ template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess, in
               .coeffRef(row + m_startRow.value(), col + m_startCol.value());
    }

-    inline const Scalar coeff(int row, int col) const
+    inline const CoeffReturnType coeff(int row, int col) const
    {
      return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
    }
@@ -165,7 +164,7 @@ template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess, in
                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
    }

-    inline const Scalar coeff(int index) const
+    inline const CoeffReturnType coeff(int index) const
    {
      return m_matrix
             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
--- a/Eigen/src/Core/CMakeLists.txt
+++ b/Eigen/src/Core/CMakeLists.txt
@@ -2,8 +2,16 @@ FILE(GLOB Eigen_Core_SRCS "*.h")

 INSTALL(FILES
  ${Eigen_Core_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core COMPONENT Devel
  )

+FILE(GLOB Eigen_Core_Product_SRCS "products/*.h")
+
+INSTALL(FILES
+  ${Eigen_Core_Product_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products COMPONENT Devel
+  )
+
+
 ADD_SUBDIRECTORY(util)
 ADD_SUBDIRECTORY(arch)
--- a/Eigen/src/Core/Coeffs.h
+++ b/Eigen/src/Core/Coeffs.h
@@ -40,7 +40,7 @@
  * \sa operator()(int,int) const, coeffRef(int,int), coeff(int) const
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::coeff(int row, int col) const
 {
  ei_internal_assert(row >= 0 && row < rows()
@@ -53,7 +53,7 @@ EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived
  * \sa operator()(int,int), operator[](int) const
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::operator()(int row, int col) const
 {
  ei_assert(row >= 0 && row < rows()
@@ -112,7 +112,7 @@ EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
  * \sa operator[](int) const, coeffRef(int), coeff(int,int) const
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::coeff(int index) const
 {
  ei_internal_assert(index >= 0 && index < size());
@@ -127,7 +127,7 @@ EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived
  * z() const, w() const
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::operator[](int index) const
 {
  ei_assert(index >= 0 && index < size());
@@ -144,7 +144,7 @@ EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived
  * z() const, w() const
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::operator()(int index) const
 {
  ei_assert(index >= 0 && index < size());
@@ -205,22 +205,22 @@ EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& MatrixBase<Derived>

 /** equivalent to operator[](0).  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::x() const { return (*this)[0]; }

 /** equivalent to operator[](1).  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::y() const { return (*this)[1]; }

 /** equivalent to operator[](2).  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::z() const { return (*this)[2]; }

 /** equivalent to operator[](3).  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::CoeffReturnType MatrixBase<Derived>
  ::w() const { return (*this)[3]; }

 /** equivalent to operator[](0).  */
--- a/Eigen/src/Core/Cwise.h
+++ b/Eigen/src/Core/Cwise.h
@@ -64,7 +64,7 @@
  * It is the return type of MatrixBase::cwise()
  * and most of the time this is the only way it is used.
  *
-  * Note that some methods are defined in the \ref Array module.
+  * Note that some methods are defined in the \ref Array_Module array module.
  *
  * Example: \include MatrixBase_cwise_const.cpp
  * Output: \verbinclude MatrixBase_cwise_const.out
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -188,7 +188,11 @@ MatrixBase<Derived>::imag() const { return derived(); }
  */
 template<typename Derived>
 template<typename NewType>
-EIGEN_STRONG_INLINE const CwiseUnaryOp<ei_scalar_cast_op<typename ei_traits<Derived>::Scalar, NewType>, Derived>
+EIGEN_STRONG_INLINE
+typename ei_cast_return_type<
+            Derived,
+            const CwiseUnaryOp<ei_scalar_cast_op<typename ei_traits<Derived>::Scalar, NewType>, Derived>
+          >::type
 MatrixBase<Derived>::cast() const
 {
  return derived();
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -0,0 +1,187 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_DIAGONAL_H
+#define EIGEN_DIAGONAL_H
+
+/** \class Diagonal
+  *
+  * \brief Expression of a diagonal/subdiagonal/superdiagonal in a matrix
+  *
+  * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
+  * \param Index the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
+  *              A positive value means a superdiagonal, a negative value means a subdiagonal.
+  *              You can also use Dynamic so the index can be set at runtime.
+  *
+  * The matrix is not required to be square.
+  *
+  * This class represents an expression of the main diagonal, or any sub/super diagonal
+  * of a square matrix. It is the return type of MatrixBase::diagonal() and MatrixBase::diagonal(int) and most of the
+  * time this is the only way it is used.
+  *
+  * \sa MatrixBase::diagonal(), MatrixBase::diagonal(int)
+  */
+template<typename MatrixType, int Index>
+struct ei_traits<Diagonal<MatrixType,Index> >
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename ei_nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ei_unref<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    AbsIndex = Index<0 ? -Index : Index, // only used if Index != Dynamic
+    RowsAtCompileTime = (int(Index) == Dynamic || int(MatrixType::SizeAtCompileTime) == Dynamic) ? Dynamic
+                      : (EIGEN_ENUM_MIN(MatrixType::RowsAtCompileTime,
+                                        MatrixType::ColsAtCompileTime) - AbsIndex),
+    ColsAtCompileTime = 1,
+    MaxRowsAtCompileTime = int(MatrixType::MaxSizeAtCompileTime) == Dynamic ? Dynamic
+                         : (EIGEN_ENUM_MIN(MatrixType::MaxRowsAtCompileTime,
+                                          MatrixType::MaxColsAtCompileTime) - AbsIndex),
+    MaxColsAtCompileTime = 1,
+    Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit),
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+  };
+};
+
+template<typename MatrixType, int Index> class Diagonal
+   : public MatrixBase<Diagonal<MatrixType, Index> >
+{
+    // some compilers may fail to optimize std::max etc in case of compile-time constants...
+    EIGEN_STRONG_INLINE int absIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_STRONG_INLINE int rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_STRONG_INLINE int colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
+    
+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Diagonal)
+
+    inline Diagonal(const MatrixType& matrix, int index = Index) : m_matrix(matrix), m_index(index) {}
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
+
+    inline int rows() const{ return m_matrix.diagonalSize() - absIndex(); }
+    inline int cols() const { return 1; }
+
+    inline Scalar& coeffRef(int row, int)
+    {
+      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+    }
+
+    inline const Scalar coeff(int row, int) const
+    {
+      return m_matrix.coeff(row+rowOffset(), row+colOffset());
+    }
+
+    inline Scalar& coeffRef(int index)
+    {
+      return m_matrix.const_cast_derived().coeffRef(index+rowOffset(), index+colOffset());
+    }
+
+    inline const Scalar coeff(int index) const
+    {
+      return m_matrix.coeff(index+rowOffset(), index+colOffset());
+    }
+
+  protected:
+    const typename MatrixType::Nested m_matrix;
+    const ei_int_if_dynamic<Index> m_index;
+};
+
+/** \returns an expression of the main diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * Example: \include MatrixBase_diagonal.cpp
+  * Output: \verbinclude MatrixBase_diagonal.out
+  *
+  * \sa class Diagonal */
+template<typename Derived>
+inline Diagonal<Derived, 0>
+MatrixBase<Derived>::diagonal()
+{
+  return Diagonal<Derived, 0>(derived());
+}
+
+/** This is the const version of diagonal(). */
+template<typename Derived>
+inline const Diagonal<Derived, 0>
+MatrixBase<Derived>::diagonal() const
+{
+  return Diagonal<Derived, 0>(derived());
+}
+
+/** \returns an expression of the \a Index-th sub or super diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * The template parameter \a Index represent a super diagonal if \a Index > 0
+  * and a sub diagonal otherwise. \a Index == 0 is equivalent to the main diagonal.
+  *
+  * Example: \include MatrixBase_diagonal_int.cpp
+  * Output: \verbinclude MatrixBase_diagonal_int.out
+  *
+  * \sa MatrixBase::diagonal(), class Diagonal */
+template<typename Derived>
+inline Diagonal<Derived, Dynamic>
+MatrixBase<Derived>::diagonal(int index)
+{
+  return Diagonal<Derived, Dynamic>(derived(), index);
+}
+
+/** This is the const version of diagonal(int). */
+template<typename Derived>
+inline const Diagonal<Derived, Dynamic>
+MatrixBase<Derived>::diagonal(int index) const
+{
+  return Diagonal<Derived, Dynamic>(derived(), index);
+}
+
+/** \returns an expression of the \a Index-th sub or super diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * The template parameter \a Index represent a super diagonal if \a Index > 0
+  * and a sub diagonal otherwise. \a Index == 0 is equivalent to the main diagonal.
+  *
+  * Example: \include MatrixBase_diagonal_template_int.cpp
+  * Output: \verbinclude MatrixBase_diagonal_template_int.out
+  *
+  * \sa MatrixBase::diagonal(), class Diagonal */
+template<typename Derived>
+template<int Index>
+inline Diagonal<Derived,Index>
+MatrixBase<Derived>::diagonal()
+{
+  return Diagonal<Derived,Index>(derived());
+}
+
+/** This is the const version of diagonal<int>(). */
+template<typename Derived>
+template<int Index>
+inline const Diagonal<Derived,Index>
+MatrixBase<Derived>::diagonal() const
+{
+  return Diagonal<Derived,Index>(derived());
+}
+
+#endif // EIGEN_DIAGONAL_H
--- a/Eigen/src/Core/DiagonalCoeffs.h
+++ b/Eigen/src/Core/DiagonalCoeffs.h
@@ -1,124 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-
-#ifndef EIGEN_DIAGONALCOEFFS_H
-#define EIGEN_DIAGONALCOEFFS_H
-
-/** \class DiagonalCoeffs
-  *
-  * \brief Expression of the main diagonal of a matrix
-  *
-  * \param MatrixType the type of the object in which we are taking the main diagonal
-  *
-  * The matrix is not required to be square.
-  *
-  * This class represents an expression of the main diagonal of a square matrix.
-  * It is the return type of MatrixBase::diagonal() and most of the time this is
-  * the only way it is used.
-  *
-  * \sa MatrixBase::diagonal()
-  */
-template<typename MatrixType>
-struct ei_traits<DiagonalCoeffs<MatrixType> >
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename ei_nested<MatrixType>::type MatrixTypeNested;
-  typedef typename ei_unref<MatrixTypeNested>::type _MatrixTypeNested;
-  enum {
-    RowsAtCompileTime = int(MatrixType::SizeAtCompileTime) == Dynamic ? Dynamic
-                      : EIGEN_ENUM_MIN(MatrixType::RowsAtCompileTime,
-                                       MatrixType::ColsAtCompileTime),
-    ColsAtCompileTime = 1,
-    MaxRowsAtCompileTime = int(MatrixType::MaxSizeAtCompileTime) == Dynamic ? Dynamic
-                            : EIGEN_ENUM_MIN(MatrixType::MaxRowsAtCompileTime,
-                                             MatrixType::MaxColsAtCompileTime),
-    MaxColsAtCompileTime = 1,
-    Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit),
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
-  };
-};
-
-template<typename MatrixType> class DiagonalCoeffs
-   : public MatrixBase<DiagonalCoeffs<MatrixType> >
-{
-  public:
-
-    EIGEN_GENERIC_PUBLIC_INTERFACE(DiagonalCoeffs)
-
-    inline DiagonalCoeffs(const MatrixType& matrix) : m_matrix(matrix) {}
-
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(DiagonalCoeffs)
-
-    inline int rows() const { return std::min(m_matrix.rows(), m_matrix.cols()); }
-    inline int cols() const { return 1; }
-
-    inline Scalar& coeffRef(int row, int)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, row);
-    }
-
-    inline const Scalar coeff(int row, int) const
-    {
-      return m_matrix.coeff(row, row);
-    }
-
-    inline Scalar& coeffRef(int index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(index, index);
-    }
-
-    inline const Scalar coeff(int index) const
-    {
-      return m_matrix.coeff(index, index);
-    }
-
-  protected:
-
-    const typename MatrixType::Nested m_matrix;
-};
-
-/** \returns an expression of the main diagonal of the matrix \c *this
-  *
-  * \c *this is not required to be square.
-  *
-  * Example: \include MatrixBase_diagonal.cpp
-  * Output: \verbinclude MatrixBase_diagonal.out
-  *
-  * \sa class DiagonalCoeffs */
-template<typename Derived>
-inline DiagonalCoeffs<Derived>
-MatrixBase<Derived>::diagonal()
-{
-  return DiagonalCoeffs<Derived>(derived());
-}
-
-/** This is the const version of diagonal(). */
-template<typename Derived>
-inline const DiagonalCoeffs<Derived>
-MatrixBase<Derived>::diagonal() const
-{
-  return DiagonalCoeffs<Derived>(derived());
-}
-
-#endif // EIGEN_DIAGONALCOEFFS_H
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -1,6 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // Eigen is free software; you can redistribute it and/or
@@ -25,57 +26,92 @@
 #ifndef EIGEN_DIAGONALMATRIX_H
 #define EIGEN_DIAGONALMATRIX_H

-/** \class DiagonalMatrix
-  * \nonstableyet 
-  *
-  * \brief Expression of a diagonal matrix
-  *
-  * \param CoeffsVectorType the type of the vector of diagonal coefficients
-  *
-  * This class is an expression of a diagonal matrix with given vector of diagonal
-  * coefficients. It is the return
-  * type of MatrixBase::diagonal(const OtherDerived&) and most of the time this is
-  * the only way it is used.
-  *
-  * \sa MatrixBase::diagonal(const OtherDerived&)
-  */
-template<typename CoeffsVectorType>
-struct ei_traits<DiagonalMatrix<CoeffsVectorType> >
-{
-  typedef typename CoeffsVectorType::Scalar Scalar;
-  typedef typename ei_nested<CoeffsVectorType>::type CoeffsVectorTypeNested;
-  typedef typename ei_unref<CoeffsVectorTypeNested>::type _CoeffsVectorTypeNested;
-  enum {
-    RowsAtCompileTime = CoeffsVectorType::SizeAtCompileTime,
-    ColsAtCompileTime = CoeffsVectorType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = CoeffsVectorType::MaxSizeAtCompileTime,
-    MaxColsAtCompileTime = CoeffsVectorType::MaxSizeAtCompileTime,
-    Flags = (_CoeffsVectorTypeNested::Flags & HereditaryBits) | Diagonal,
-    CoeffReadCost = _CoeffsVectorTypeNested::CoeffReadCost
-  };
-};

-template<typename CoeffsVectorType>
-class DiagonalMatrix : ei_no_assignment_operator,
-   public MatrixBase<DiagonalMatrix<CoeffsVectorType> >
+template<typename CoeffsVectorType, typename Derived>
+class DiagonalMatrixBase : ei_no_assignment_operator,
+   public MatrixBase<Derived>
 {
  public:
+    typedef MatrixBase<Derived> Base;
+    typedef typename ei_traits<Derived>::Scalar Scalar;
+    typedef typename Base::PacketScalar PacketScalar;
+    using Base::derived;
+    typedef typename ei_cleantype<CoeffsVectorType>::type _CoeffsVectorType;

-    EIGEN_GENERIC_PUBLIC_INTERFACE(DiagonalMatrix)
+  protected:

-    // needed to evaluate a DiagonalMatrix<Xpr> to a DiagonalMatrix<NestByValue<Vector> >
-    template<typename OtherCoeffsVectorType>
-    inline DiagonalMatrix(const DiagonalMatrix<OtherCoeffsVectorType>& other) : m_coeffs(other.diagonal())
+    // MSVC gets crazy if we define default parameters
+    template<typename OtherDerived, bool IsVector, bool IsDiagonal> struct construct_from_expression;
+
+    // = vector
+    template<typename OtherDerived>
+    struct construct_from_expression<OtherDerived,true,false>
    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(CoeffsVectorType);
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherCoeffsVectorType);
-      ei_assert(m_coeffs.size() > 0);
+      static void run(Derived& dst, const OtherDerived& src)
+      { dst.diagonal() = src; }
+    };
+
+    // = diagonal expression
+    template<typename OtherDerived, bool IsVector>
+    struct construct_from_expression<OtherDerived,IsVector,true>
+    {
+      static void run(Derived& dst, const OtherDerived& src)
+      { dst.diagonal() = src.diagonal(); }
+    };
+
+    /** Default constructor without initialization */
+    inline DiagonalMatrixBase() {}
+    /** Constructs a diagonal matrix with given dimension */
+    inline DiagonalMatrixBase(int dim) : m_coeffs(dim) {}
+    /** Generic constructor from an expression */
+    template<typename OtherDerived>
+    inline DiagonalMatrixBase(const MatrixBase<OtherDerived>& other)
+    {
+      construct_from_expression<OtherDerived,OtherDerived::IsVectorAtCompileTime,ei_is_diagonal<OtherDerived>::ret>
+        ::run(derived(),other.derived());
+    }
+    
+    template<typename NewType,int dummy=0>
+    struct cast_selector {
+      typedef const DiagonalMatrixWrapper<NestByValue<CwiseUnaryOp<ei_scalar_cast_op<Scalar, NewType>, _CoeffsVectorType> > > return_type;
+      inline static return_type run(const DiagonalMatrixBase& d) {
+        return d.m_coeffs.template cast<NewType>().nestByValue().asDiagonal();
+      }
+    };
+    
+    template<int dummy>
+    struct cast_selector<Scalar,dummy> {
+      typedef const Derived& return_type;
+      inline static return_type run(const DiagonalMatrixBase& d) {
+        return d.derived();
+      }
+    };
+
+  public:
+
+    inline DiagonalMatrixBase(const _CoeffsVectorType& coeffs) : m_coeffs(coeffs)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(_CoeffsVectorType);
+      ei_assert(coeffs.size() > 0);
    }

-    inline DiagonalMatrix(const CoeffsVectorType& coeffs) : m_coeffs(coeffs)
+    template<typename NewType>
+    inline typename cast_selector<NewType,0>::return_type
+    cast() const
    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(CoeffsVectorType);
-      ei_assert(coeffs.size() > 0);
+      return cast_selector<NewType,0>::run(*this);
+    }
+
+    /** Assignment operator.
+      * The right-hand-side \a other must be either a vector representing the diagonal
+      * coefficients or a diagonal matrix expression.
+      */
+    template<typename OtherDerived>
+    inline Derived& operator=(const MatrixBase<OtherDerived>& other)
+    {
+      construct_from_expression<OtherDerived,OtherDerived::IsVectorAtCompileTime,ei_is_diagonal<OtherDerived>::ret>
+        ::run(derived(),other);
+      return derived();
    }

    inline int rows() const { return m_coeffs.size(); }
@@ -86,13 +122,153 @@ class DiagonalMatrix : ei_no_assignment_operator,
      return row == col ? m_coeffs.coeff(row) : static_cast<Scalar>(0);
    }

-    inline const CoeffsVectorType& diagonal() const { return m_coeffs; }
+    inline Scalar& coeffRef(int row, int col)
+    {
+      ei_assert(row==col);
+      return m_coeffs.coeffRef(row);
+    }
+
+    inline _CoeffsVectorType& diagonal() { return m_coeffs; }
+    inline const _CoeffsVectorType& diagonal() const { return m_coeffs; }

  protected:
-    const typename CoeffsVectorType::Nested m_coeffs;
+    CoeffsVectorType m_coeffs;
 };

-/** \nonstableyet 
+/** \class DiagonalMatrix
+  * \nonstableyet
+  *
+  * \brief Represent a diagonal matrix with its storage
+  *
+  * \param _Scalar the type of coefficients
+  * \param _Size the dimension of the matrix
+  *
+  * \sa class Matrix
+  */
+template<typename _Scalar,int _Size>
+struct ei_traits<DiagonalMatrix<_Scalar,_Size> > : ei_traits<Matrix<_Scalar,_Size,_Size> >
+{
+  enum {
+    Flags = (ei_traits<Matrix<_Scalar,_Size,_Size> >::Flags & HereditaryBits) | DiagonalBits
+  };
+};
+
+template<typename _Scalar, int _Size>
+class DiagonalMatrix
+  : public DiagonalMatrixBase<Matrix<_Scalar,_Size,1>, DiagonalMatrix<_Scalar,_Size> >
+{
+  public:
+    EIGEN_GENERIC_PUBLIC_INTERFACE(DiagonalMatrix)
+    typedef DiagonalMatrixBase<Matrix<_Scalar,_Size,1>, DiagonalMatrix<_Scalar,_Size> > DiagonalBase;
+
+  protected:
+    typedef Matrix<_Scalar,_Size,1> CoeffVectorType;
+    using DiagonalBase::m_coeffs;
+
+  public:
+
+    /** Default constructor without initialization */
+    inline DiagonalMatrix() : DiagonalBase()
+    {}
+
+    /** Constructs a diagonal matrix with given dimension  */
+    inline DiagonalMatrix(int dim) : DiagonalBase(dim)
+    {}
+
+    /** 2D only */
+    inline DiagonalMatrix(const Scalar& sx, const Scalar& sy)
+    {
+      EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(DiagonalMatrix,2,2);
+      m_coeffs.x() = sx;
+      m_coeffs.y() = sy;
+    }
+    /** 3D only */
+    inline DiagonalMatrix(const Scalar& sx, const Scalar& sy, const Scalar& sz)
+    {
+      EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(DiagonalMatrix,3,3);
+      m_coeffs.x() = sx;
+      m_coeffs.y() = sy;
+      m_coeffs.z() = sz;
+    }
+
+    /** copy constructor */
+    inline DiagonalMatrix(const DiagonalMatrix& other) : DiagonalBase(other.m_coeffs)
+    {}
+
+    /** generic constructor from expression */
+    template<typename OtherDerived>
+    explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : DiagonalBase(other)
+    {}
+
+    DiagonalMatrix& operator=(const DiagonalMatrix& other)
+    {
+      m_coeffs = other.m_coeffs;
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    DiagonalMatrix& operator=(const MatrixBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT(ei_is_diagonal<OtherDerived>::ret, THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX);
+      m_coeffs = other.diagonal();
+      return *this;
+    }
+    
+    inline void resize(int size)
+    {
+      m_coeffs.resize(size);
+    }
+    
+    inline void resize(int rows, int cols)
+    {
+      ei_assert(rows==cols && "a diagonal matrix must be square");
+      m_coeffs.resize(rows);
+    }
+    
+    inline void setZero() { m_coeffs.setZero(); }
+};
+
+/** \class DiagonalMatrixWrapper
+  * \nonstableyet
+  *
+  * \brief Expression of a diagonal matrix
+  *
+  * \param CoeffsVectorType the type of the vector of diagonal coefficients
+  *
+  * This class is an expression of a diagonal matrix with given vector of diagonal
+  * coefficients. It is the return type of MatrixBase::diagonal(const OtherDerived&)
+  * and most of the time this is the only way it is used.
+  *
+  * \sa class DiagonalMatrixBase, class DiagonalMatrix, MatrixBase::asDiagonal()
+  */
+template<typename CoeffsVectorType>
+struct ei_traits<DiagonalMatrixWrapper<CoeffsVectorType> >
+{
+  typedef typename CoeffsVectorType::Scalar Scalar;
+  typedef typename ei_nested<CoeffsVectorType>::type CoeffsVectorTypeNested;
+  typedef typename ei_unref<CoeffsVectorTypeNested>::type _CoeffsVectorTypeNested;
+  enum {
+    RowsAtCompileTime = CoeffsVectorType::SizeAtCompileTime,
+    ColsAtCompileTime = CoeffsVectorType::SizeAtCompileTime,
+    MaxRowsAtCompileTime = CoeffsVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = CoeffsVectorType::MaxSizeAtCompileTime,
+    Flags = (_CoeffsVectorTypeNested::Flags & HereditaryBits) | DiagonalBits,
+    CoeffReadCost = _CoeffsVectorTypeNested::CoeffReadCost
+  };
+};
+template<typename CoeffsVectorType>
+class DiagonalMatrixWrapper
+  : public DiagonalMatrixBase<typename CoeffsVectorType::Nested, DiagonalMatrixWrapper<CoeffsVectorType> >
+{
+    typedef typename CoeffsVectorType::Nested CoeffsVectorTypeNested;
+    typedef DiagonalMatrixBase<CoeffsVectorTypeNested, DiagonalMatrixWrapper<CoeffsVectorType> > DiagonalBase;
+  public:
+    EIGEN_GENERIC_PUBLIC_INTERFACE(DiagonalMatrixWrapper)
+    inline DiagonalMatrixWrapper(const CoeffsVectorType& coeffs) : DiagonalBase(coeffs)
+    {}
+};
+
+/** \nonstableyet
  * \returns an expression of a diagonal matrix with *this as vector of diagonal coefficients
  *
  * \only_for_vectors
@@ -105,13 +281,13 @@ class DiagonalMatrix : ei_no_assignment_operator,
  * \sa class DiagonalMatrix, isDiagonal()
  **/
 template<typename Derived>
-inline const DiagonalMatrix<Derived>
+inline const DiagonalMatrixWrapper<Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
  return derived();
 }

-/** \nonstableyet 
+/** \nonstableyet
  * \returns true if *this is approximately equal to a diagonal matrix,
  *          within the precision given by \a prec.
  *
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -30,9 +30,9 @@
 *  Unlike ei_nested, if the argument is a DiagonalMatrix and if it must be evaluated,
 *  then it evaluated to a DiagonalMatrix having its own argument evaluated.
 */
-template<typename T, int N> struct ei_nested_diagonal : ei_nested<T,N> {};
-template<typename T, int N> struct ei_nested_diagonal<DiagonalMatrix<T>,N >
- : ei_nested<DiagonalMatrix<T>, N, DiagonalMatrix<NestByValue<typename ei_plain_matrix_type<T>::type> > >
+template<typename T, int N, bool IsDiagonal = ei_is_diagonal<T>::ret> struct ei_nested_diagonal : ei_nested<T,N> {};
+template<typename T, int N> struct ei_nested_diagonal<T,N,true>
+ : ei_nested<T, N, DiagonalMatrix<typename T::Scalar, EIGEN_ENUM_MIN(T::RowsAtCompileTime,T::ColsAtCompileTime)> >
 {};

 // specialization of ProductReturnType
@@ -61,8 +61,8 @@ struct ei_traits<Product<LhsNested, RhsNested, DiagonalProduct> >
    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,

-    LhsIsDiagonal = (_LhsNested::Flags&Diagonal)==Diagonal,
-    RhsIsDiagonal = (_RhsNested::Flags&Diagonal)==Diagonal,
+    LhsIsDiagonal = ei_is_diagonal<_LhsNested>::ret,
+    RhsIsDiagonal = ei_is_diagonal<_RhsNested>::ret,

    CanVectorizeRhs =  (!RhsIsDiagonal) && (RhsFlags & RowMajorBit) && (RhsFlags & PacketAccessBit)
                     && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
@@ -86,7 +86,7 @@ template<typename LhsNested, typename RhsNested> class Product<LhsNested, RhsNes
    typedef typename ei_traits<Product>::_RhsNested _RhsNested;

    enum {
-      RhsIsDiagonal = (_RhsNested::Flags&Diagonal)==Diagonal
+      RhsIsDiagonal = ei_is_diagonal<_RhsNested>::ret
    };

  public:
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -266,7 +266,10 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const

  ei_assert(size() == other.size());

-  return ei_dot_impl<Derived, OtherDerived>::run(derived(), other.derived());
+  // dot() must honor EvalBeforeNestingBit (eg: v.dot(M*v) )
+  typedef typename ei_cleantype<typename Derived::Nested>::type ThisNested;
+  typedef typename ei_cleantype<typename OtherDerived::Nested>::type OtherNested;
+  return ei_dot_impl<ThisNested, OtherNested>::run(derived(), other.derived());
 }

 /** \returns the squared \em l2 norm of *this, i.e., for vectors, the dot product of *this with itself.
@@ -289,6 +292,18 @@ inline typename NumTraits<typename ei_traits<Derived>::Scalar>::Real MatrixBase<
  return ei_sqrt(squaredNorm());
 }

+/** \returns the \em l2 norm of \c *this using a numerically more stable
+  * algorithm.
+  *
+  * \sa norm(), dot(), squaredNorm()
+  */
+template<typename Derived>
+inline typename NumTraits<typename ei_traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::stableNorm() const
+{
+  return this->cwise().abs().redux(ei_scalar_hypot_op<RealScalar>());
+}
+
 /** \returns an expression of the quotient of *this by its own norm.
  *
  * \only_for_vectors
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -37,6 +37,9 @@ template<typename Scalar> struct ei_scalar_sum_op EIGEN_EMPTY_STRUCT {
  template<typename PacketScalar>
  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
  { return ei_padd(a,b); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+  { return ei_predux(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_sum_op<Scalar> > {
@@ -56,6 +59,9 @@ template<typename Scalar> struct ei_scalar_product_op EIGEN_EMPTY_STRUCT {
  template<typename PacketScalar>
  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
  { return ei_pmul(a,b); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+  { return ei_predux_mul(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_product_op<Scalar> > {
@@ -75,6 +81,9 @@ template<typename Scalar> struct ei_scalar_min_op EIGEN_EMPTY_STRUCT {
  template<typename PacketScalar>
  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
  { return ei_pmin(a,b); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+  { return ei_predux_min(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_min_op<Scalar> > {
@@ -94,6 +103,9 @@ template<typename Scalar> struct ei_scalar_max_op EIGEN_EMPTY_STRUCT {
  template<typename PacketScalar>
  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
  { return ei_pmax(a,b); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+  { return ei_predux_max(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_max_op<Scalar> > {
@@ -103,6 +115,28 @@ struct ei_functor_traits<ei_scalar_max_op<Scalar> > {
  };
 };

+/** \internal
+  * \brief Template functor to compute the hypot of two scalars
+  *
+  * \sa MatrixBase::stableNorm(), class Redux
+  */
+template<typename Scalar> struct ei_scalar_hypot_op EIGEN_EMPTY_STRUCT {
+//   typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+  {
+//     typedef typename NumTraits<T>::Real RealScalar;
+//     RealScalar _x = ei_abs(x);
+//     RealScalar _y = ei_abs(y);
+    Scalar p = std::max(_x, _y);
+    Scalar q = std::min(_x, _y);
+    Scalar qp = q/p;
+    return p * ei_sqrt(Scalar(1) + qp*qp);
+  }
+};
+template<typename Scalar>
+struct ei_functor_traits<ei_scalar_hypot_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
+};

 // other binary functors:

@@ -156,10 +190,16 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > {
  */
 template<typename Scalar> struct ei_scalar_opposite_op EIGEN_EMPTY_STRUCT {
  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const
+  { return ei_pnegate(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_opposite_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false }; };
+{ enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = int(ei_packet_traits<Scalar>::size)>1 };
+};

 /** \internal
  * \brief Template functor to compute the absolute value of a scalar
@@ -169,13 +209,16 @@ struct ei_functor_traits<ei_scalar_opposite_op<Scalar> >
 template<typename Scalar> struct ei_scalar_abs_op EIGEN_EMPTY_STRUCT {
  typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs(a); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const
+  { return ei_pabs(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_abs_op<Scalar> >
 {
  enum {
    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = false // this could actually be vectorized with SSSE3.
+    PacketAccess = int(ei_packet_traits<Scalar>::size)>1
  };
 };

@@ -265,7 +308,7 @@ struct ei_functor_traits<ei_scalar_imag_op<Scalar> >
 * indeed it seems better to declare m_other as a PacketScalar and do the ei_pset1() once
 * in the constructor. However, in practice:
 *  - GCC does not like m_other as a PacketScalar and generate a load every time it needs it
- *  - one the other hand GCC is able to moves the ei_pset1() away the loop :)
+ *  - on the other hand GCC is able to moves the ei_pset1() away the loop :)
 *  - simpler code ;)
 * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
 */
@@ -284,6 +327,18 @@ template<typename Scalar>
 struct ei_functor_traits<ei_scalar_multiple_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::size>1 }; };

+template<typename Scalar1, typename Scalar2>
+struct ei_scalar_multiple2_op {
+  typedef typename ei_scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
+  EIGEN_STRONG_INLINE ei_scalar_multiple2_op(const ei_scalar_multiple2_op& other) : m_other(other.m_other) { }
+  EIGEN_STRONG_INLINE ei_scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
+  const Scalar2 m_other;
+};
+template<typename Scalar1,typename Scalar2>
+struct ei_functor_traits<ei_scalar_multiple2_op<Scalar1,Scalar2> >
+{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
+
 template<typename Scalar, bool HasFloatingPoint>
 struct ei_scalar_quotient1_impl {
  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
@@ -324,6 +379,10 @@ struct ei_scalar_quotient1_op : ei_scalar_quotient1_impl<Scalar, NumTraits<Scala
  EIGEN_STRONG_INLINE ei_scalar_quotient1_op(const Scalar& other)
    : ei_scalar_quotient1_impl<Scalar, NumTraits<Scalar>::HasFloatingPoint >(other) {}
 };
+template<typename Scalar>
+struct ei_functor_traits<ei_scalar_quotient1_op<Scalar> > 
+: ei_functor_traits<ei_scalar_quotient1_impl<Scalar, NumTraits<Scalar>::HasFloatingPoint> >
+{};

 // nullary functors

--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -34,6 +34,47 @@
  * of generic vectorized code.
  */

+struct ei_default_packet_traits
+{
+  enum {
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasMin    = 1,
+    HasMax    = 1,
+    
+    HasDiv    = 0,
+    HasSqrt   = 0,
+    HasExp    = 0,
+    HasLog    = 0,
+    HasPow    = 0,
+    
+    HasSin    = 0,
+    HasCos    = 0,
+    HasTan    = 0,
+    HasASin   = 0,
+    HasACos   = 0,
+    HasATan   = 0
+  };
+};
+
+template<typename T> struct ei_packet_traits : ei_default_packet_traits
+{
+  typedef T type;
+  enum {size=1};
+  enum {
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasMin    = 0,
+    HasMax    = 0
+  };
+};
+
 /** \internal \returns a + b (coeff-wise) */
 template<typename Packet> inline Packet
 ei_padd(const Packet& a,
@@ -44,6 +85,10 @@ template<typename Packet> inline Packet
 ei_psub(const Packet& a,
        const Packet& b) { return a-b; }

+/** \internal \returns -a (coeff-wise) */
+template<typename Packet> inline Packet
+ei_pnegate(const Packet& a) { return -a; }
+
 /** \internal \returns a * b (coeff-wise) */
 template<typename Packet> inline Packet
 ei_pmul(const Packet& a,
@@ -64,6 +109,26 @@ template<typename Packet> inline Packet
 ei_pmax(const Packet& a,
        const Packet& b) { return std::max(a, b); }

+/** \internal \returns the absolute value of \a a */
+template<typename Packet> inline Packet
+ei_pabs(const Packet& a) { return ei_abs(a); }
+
+/** \internal \returns the bitwise and of \a a and \a b */
+template<typename Packet> inline Packet
+ei_pand(const Packet& a, const Packet& b) { return a & b; }
+
+/** \internal \returns the bitwise or of \a a and \a b */
+template<typename Packet> inline Packet
+ei_por(const Packet& a, const Packet& b) { return a | b; }
+
+/** \internal \returns the bitwise xor of \a a and \a b */
+template<typename Packet> inline Packet
+ei_pxor(const Packet& a, const Packet& b) { return a ^ b; }
+
+/** \internal \returns the bitwise andnot of \a a and \a b */
+template<typename Packet> inline Packet
+ei_pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+        
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Scalar> inline typename ei_packet_traits<Scalar>::type
 ei_pload(const Scalar* from) { return *from; }
@@ -96,6 +161,40 @@ ei_preduxp(const Packet* vecs) { return vecs[0]; }
 template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_predux(const Packet& a)
 { return a; }

+/** \internal \returns the product of the elements of \a a*/
+template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_predux_mul(const Packet& a)
+{ return a; }
+
+/** \internal \returns the min of the elements of \a a*/
+template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_predux_min(const Packet& a)
+{ return a; }
+
+/** \internal \returns the max of the elements of \a a*/
+template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_predux_max(const Packet& a)
+{ return a; }
+
+/** \internal \returns the reversed elements of \a a*/
+template<typename Packet> inline Packet ei_preverse(const Packet& a)
+{ return a; }
+
+/**************************
+* Special math functions
+***************************/
+
+/** \internal \returns the sin of \a a (coeff-wise) */
+template<typename Packet> inline static Packet ei_psin(Packet a) { return ei_sin(a); }
+
+/** \internal \returns the cos of \a a (coeff-wise) */
+template<typename Packet> inline static Packet ei_pcos(Packet a) { return ei_cos(a); }
+
+/** \internal \returns the exp of \a a (coeff-wise) */
+template<typename Packet> inline static Packet ei_pexp(Packet a) { return ei_exp(a); }
+
+/** \internal \returns the log of \a a (coeff-wise) */
+template<typename Packet> inline static Packet ei_plog(Packet a) { return ei_log(a); }
+
+/** \internal \returns the square-root of \a a (coeff-wise) */
+template<typename Packet> inline static Packet ei_psqrt(Packet a) { return ei_sqrt(a); }

 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -40,6 +40,12 @@
  * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
  * such as plain C arrays or structures from other libraries.
  *
+  * \b Tips: to change the array of data mapped by a Map object, you can use the C++
+  * placement new syntax:
+  *
+  * Example: \include Map_placement_new.cpp
+  * Output: \verbinclude Map_placement_new.out
+  *
  * This class is the return type of Matrix::Map() but can also be used directly.
  *
  * \sa Matrix::Map()
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -96,7 +96,7 @@ template<typename Derived> class MapBase
        return const_cast<Scalar*>(m_data)[row + col * stride()];
    }

-    inline const Scalar coeff(int index) const
+    inline const Scalar& coeff(int index) const
    {
      ei_assert(Derived::IsVectorAtCompileTime || (ei_traits<Derived>::Flags & LinearAccessBit));
      if ( ((RowsAtCompileTime == 1) == IsRowMajor) )
@@ -176,7 +176,7 @@ template<typename Derived> class MapBase
    {
      return Base::operator=(other);
    }
-    
+
    using Base::operator*=;

    template<typename OtherDerived>
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -26,6 +26,7 @@
 #define EIGEN_MATHFUNCTIONS_H

 template<typename T> inline typename NumTraits<T>::Real precision();
+template<typename T> inline typename NumTraits<T>::Real machine_epsilon();
 template<typename T> inline T ei_random(T a, T b);
 template<typename T> inline T ei_random();
 template<typename T> inline T ei_random_amplitude()
@@ -34,10 +35,11 @@ template<typename T> inline T ei_random_amplitude()
  else return static_cast<T>(10);
 }

-template<typename T> inline T ei_hypot(T x, T y)
+template<typename T> inline typename NumTraits<T>::Real ei_hypot(T x, T y)
 {
-  T _x = ei_abs(x);
-  T _y = ei_abs(y);
+  typedef typename NumTraits<T>::Real RealScalar;
+  RealScalar _x = ei_abs(x);
+  RealScalar _y = ei_abs(y);
  T p = std::max(_x, _y);
  T q = std::min(_x, _y);
  T qp = q/p;
@@ -49,6 +51,7 @@ template<typename T> inline T ei_hypot(T x, T y)
 **************/

 template<> inline int precision<int>() { return 0; }
+template<> inline int machine_epsilon<int>() { return 0; }
 inline int ei_real(int x)  { return x; }
 inline int ei_imag(int)    { return 0; }
 inline int ei_conj(int x)  { return x; }
@@ -59,12 +62,20 @@ inline int ei_exp(int)  { ei_assert(false); return 0; }
 inline int ei_log(int)  { ei_assert(false); return 0; }
 inline int ei_sin(int)  { ei_assert(false); return 0; }
 inline int ei_cos(int)  { ei_assert(false); return 0; }
-
-#if EIGEN_GNUC_AT_LEAST(4,3)
-inline int ei_pow(int x, int y) { return int(std::pow(x, y)); }
-#else
-inline int ei_pow(int x, int y) { return int(std::pow(double(x), y)); }
-#endif
+inline int ei_pow(int x, int y)
+{
+  int res = 1;
+  if(y < 0) return 0;
+  if(y & 1) res *= x;
+  y >>= 1;
+  while(y)
+  {
+    x *= x;
+    if(y&1) res *= x;
+    y >>= 1;
+  }
+  return res;
+}

 template<> inline int ei_random(int a, int b)
 {
@@ -93,6 +104,7 @@ inline bool ei_isApproxOrLessThan(int a, int b, int = precision<int>())
 **************/

 template<> inline float precision<float>() { return 1e-5f; }
+template<> inline float machine_epsilon<float>() { return 1.192e-07f; }
 inline float ei_real(float x)  { return x; }
 inline float ei_imag(float)    { return 0.f; }
 inline float ei_conj(float x)  { return x; }
@@ -138,6 +150,8 @@ inline bool ei_isApproxOrLessThan(float a, float b, float prec = precision<float
 **************/

 template<> inline double precision<double>() { return 1e-11; }
+template<> inline double machine_epsilon<double>() { return 2.220e-16; }
+
 inline double ei_real(double x)  { return x; }
 inline double ei_imag(double)    { return 0.; }
 inline double ei_conj(double x)  { return x; }
@@ -183,6 +197,7 @@ inline bool ei_isApproxOrLessThan(double a, double b, double prec = precision<do
 *********************/

 template<> inline float precision<std::complex<float> >() { return precision<float>(); }
+template<> inline float machine_epsilon<std::complex<float> >() { return machine_epsilon<float>(); }
 inline float ei_real(const std::complex<float>& x) { return std::real(x); }
 inline float ei_imag(const std::complex<float>& x) { return std::imag(x); }
 inline std::complex<float> ei_conj(const std::complex<float>& x) { return std::conj(x); }
@@ -216,6 +231,7 @@ inline bool ei_isApprox(const std::complex<float>& a, const std::complex<float>&
 **********************/

 template<> inline double precision<std::complex<double> >() { return precision<double>(); }
+template<> inline double machine_epsilon<std::complex<double> >() { return machine_epsilon<double>(); }
 inline double ei_real(const std::complex<double>& x) { return std::real(x); }
 inline double ei_imag(const std::complex<double>& x) { return std::imag(x); }
 inline std::complex<double> ei_conj(const std::complex<double>& x) { return std::conj(x); }
@@ -250,6 +266,7 @@ inline bool ei_isApprox(const std::complex<double>& a, const std::complex<double
 ******************/

 template<> inline long double precision<long double>() { return precision<double>(); }
+template<> inline long double machine_epsilon<long double>() { return 1.084e-19l; }
 inline long double ei_real(long double x)  { return x; }
 inline long double ei_imag(long double)    { return 0.; }
 inline long double ei_conj(long double x)  { return x; }
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -134,7 +134,7 @@ class Matrix
    ei_matrix_storage<Scalar, MaxSizeAtCompileTime, RowsAtCompileTime, ColsAtCompileTime, Options> m_storage;

  public:
-    enum { NeedsToAlign = (Options&AutoAlign) == AutoAlign
+    enum { NeedsToAlign = (!(Options&DontAlign))
                          && SizeAtCompileTime!=Dynamic && ((sizeof(Scalar)*SizeAtCompileTime)%16)==0 };
    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)

@@ -246,6 +246,29 @@ class Matrix
        m_storage.resize(size, size, 1);
    }

+    /** Resizes *this to have the same dimensions as \a other.
+      * Takes care of doing all the checking that's needed.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE void resizeLike(const MatrixBase<OtherDerived>& other)
+    {
+      if(RowsAtCompileTime == 1)
+      {
+        ei_assert(other.isVector());
+        resize(1, other.size());
+      }
+      else if(ColsAtCompileTime == 1)
+      {
+        ei_assert(other.isVector());
+        resize(other.size(), 1);
+      }
+      else resize(other.rows(), other.cols());
+    }
+
    /** Copies the value of the expression \a other into \c *this with automatic resizing.
      *
      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
@@ -266,9 +289,13 @@ class Matrix
      */
    EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
    {
-      return _set(other);
+      return  _set(other);
    }

+    template<typename OtherDerived,typename OtherEvalType>
+    EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived,OtherEvalType>& func)
+    { return Base::operator=(func); }
+
    EIGEN_INHERIT_ASSIGNMENT_OPERATOR(Matrix, +=)
    EIGEN_INHERIT_ASSIGNMENT_OPERATOR(Matrix, -=)
    EIGEN_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Matrix, *=)
@@ -311,47 +338,24 @@ class Matrix
      ei_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
    }

-    /** This constructor has two very different behaviors, depending on the type of *this.
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename T0, typename T1>
+    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
+    {
+      _check_template_params();
+      _init2<T0,T1>(x, y);
+    }
+    #else
+    /** constructs an uninitialized matrix with \a rows rows and \a cols columns.
      *
-      * \li When Matrix is a fixed-size vector type of size 2, this constructor constructs
-      *     an initialized vector. The parameters \a x, \a y are copied into the first and second
-      *     coords of the vector respectively.
-      * \li Otherwise, this constructor constructs an uninitialized matrix with \a x rows and
-      *     \a y columns. This is useful for dynamic-size matrices. For fixed-size matrices,
-      *     it is redundant to pass these parameters, so one should use the default constructor
-      *     Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE Matrix(int x, int y) : m_storage(x*y, x, y)
-    {
-      _check_template_params();
-      if((RowsAtCompileTime == 1 && ColsAtCompileTime == 2)
-      || (RowsAtCompileTime == 2 && ColsAtCompileTime == 1))
-      {
-        m_storage.data()[0] = Scalar(x);
-        m_storage.data()[1] = Scalar(y);
-      }
-      else
-      {
-        ei_assert(x > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == x)
-               && y > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == y));
-      }
-    }
+      * This is useful for dynamic-size matrices. For fixed-size matrices,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead. */
+    Matrix(int rows, int cols);
    /** constructs an initialized 2D vector with given coefficients */
-    EIGEN_STRONG_INLINE Matrix(const float& x, const float& y)
-    {
-      _check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 2)
-      m_storage.data()[0] = x;
-      m_storage.data()[1] = y;
-    }
-    /** constructs an initialized 2D vector with given coefficients */
-    EIGEN_STRONG_INLINE Matrix(const double& x, const double& y)
-    {
-      _check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 2)
-      m_storage.data()[0] = x;
-      m_storage.data()[1] = y;
-    }
+    Matrix(const Scalar& x, const Scalar& y);
+    #endif
+
    /** constructs an initialized 3D vector with given coefficients */
    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
    {
@@ -389,6 +393,10 @@ class Matrix
      _check_template_params();
      _set_noalias(other);
    }
+    /** Copy constructor with in-place evaluation */
+    template<typename OtherDerived,typename OtherEvalType>
+    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived,OtherEvalType>& other)
+    { other.evalTo(*this); }
    /** Destructor */
    inline ~Matrix() {}

@@ -480,17 +488,12 @@ class Matrix
    template<typename OtherDerived>
    EIGEN_STRONG_INLINE void _resize_to_match(const MatrixBase<OtherDerived>& other)
    {
-      if(RowsAtCompileTime == 1)
-      {
-        ei_assert(other.isVector());
-        resize(1, other.size());
-      }
-      else if(ColsAtCompileTime == 1)
-      {
-        ei_assert(other.isVector());
-        resize(other.size(), 1);
-      }
-      else resize(other.rows(), other.cols());
+      #ifdef EIGEN_NO_AUTOMATIC_RESIZING
+      ei_assert((this->size()==0 || (IsVectorAtCompileTime ? (this->size() == other.size())
+                 : (rows() == other.rows() && cols() == other.cols())))
+        && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+      #endif
+      resizeLike(other);
    }

    /** \internal Copies the value of the expression \a other into \c *this with automatic resizing.
@@ -527,13 +530,28 @@ class Matrix

    static EIGEN_STRONG_INLINE void _check_template_params()
    {
-        EIGEN_STATIC_ASSERT((_Rows > 0
-                        && _Cols > 0
-                        && _MaxRows <= _Rows
-                        && _MaxCols <= _Cols
-                        && (_Options & (AutoAlign|RowMajor)) == _Options),
+        EIGEN_STATIC_ASSERT(((_MaxRows >= _Rows || _Rows==Dynamic)
+                          && (_MaxCols >= _Cols || _Cols==Dynamic)
+                          && ((_MaxRows==Dynamic?1:_MaxRows)*(_MaxCols==Dynamic?1:_MaxCols)<Dynamic)
+                          && (_Options & (DontAlign|RowMajor)) == _Options),
          INVALID_MATRIX_TEMPLATE_PARAMETERS)
    }
+
+
+    template<typename T0, typename T1>
+    EIGEN_STRONG_INLINE void _init2(int rows, int cols, typename ei_enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
+    {
+      ei_assert(rows > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+             && cols > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
+      m_storage.resize(rows*cols,rows,cols);
+    }
+    template<typename T0, typename T1>
+    EIGEN_STRONG_INLINE void _init2(const Scalar& x, const Scalar& y, typename ei_enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 2)
+      m_storage.data()[0] = x;
+      m_storage.data()[1] = y;
+    }
 };

 /** \defgroup matrixtypedefs Global matrix typedefs
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -34,7 +34,7 @@
  * types. Most of the Eigen API is contained in this class. Other important classes for
  * the Eigen API are Matrix, Cwise, and PartialRedux.
  *
-  * Note that some methods are defined in the \ref Array module.
+  * Note that some methods are defined in the \ref Array_Module array module.
  *
  * \param Derived is the derived type, e.g. a matrix type, or an expression, etc.
  *
@@ -53,10 +53,17 @@
  *
  */
 template<typename Derived> class MatrixBase
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  : public ei_special_scalar_op_base<Derived,typename ei_traits<Derived>::Scalar,
+                typename NumTraits<typename ei_traits<Derived>::Scalar>::Real>
+#endif // not EIGEN_PARSED_BY_DOXYGEN
 {
  public:

 #ifndef EIGEN_PARSED_BY_DOXYGEN
+    using ei_special_scalar_op_base<Derived,typename ei_traits<Derived>::Scalar,
+                typename NumTraits<typename ei_traits<Derived>::Scalar>::Real>::operator*;
+
    class InnerIterator;

    typedef typename ei_traits<Derived>::Scalar Scalar;
@@ -160,12 +167,15 @@ template<typename Derived> class MatrixBase
    inline int rows() const { return derived().rows(); }
    /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
    inline int cols() const { return derived().cols(); }
-    /** \returns the number of coefficients, which is \a rows()*cols().
+    /** \returns the number of coefficients, which is rows()*cols().
      * \sa rows(), cols(), SizeAtCompileTime. */
    inline int size() const { return rows() * cols(); }
+    /** \returns the size of the main diagonal, which is min(rows(),cols()).
+      * \sa rows(), cols(), SizeAtCompileTime. */
+    inline int diagonalSize() const { return std::min(rows(),cols()); }
    /** \returns the number of nonzero coefficients which is in practice the number
      * of stored coefficients. */
-    inline int nonZeros() const { return derived.nonZeros(); }
+    inline int nonZeros() const { return derived().nonZeros(); }
    /** \returns true if either the number of rows or the number of columns is equal to 1.
      * In other words, this function returns
      * \code rows()==1 || cols()==1 \endcode
@@ -192,6 +202,10 @@ template<typename Derived> class MatrixBase
      */
    typedef typename ei_plain_matrix_type<Derived>::type PlainMatrixType_ColMajor;

+    /** \internal the return type of coeff()
+      */
+    typedef typename ei_meta_if<bool(int(Flags)&DirectAccessBit), const Scalar&, Scalar>::ret CoeffReturnType;
+
    /** \internal Represents a matrix with all coefficients equal to one another*/
    typedef CwiseNullaryOp<ei_scalar_constant_op<Scalar>,Derived> ConstantReturnType;
    /** \internal Represents a scalar multiple of a matrix */
@@ -257,15 +271,15 @@ template<typename Derived> class MatrixBase
    template<typename OtherDerived>
    CommaInitializer<Derived> operator<< (const MatrixBase<OtherDerived>& other);

-    const Scalar coeff(int row, int col) const;
-    const Scalar operator()(int row, int col) const;
+    const CoeffReturnType coeff(int row, int col) const;
+    const CoeffReturnType operator()(int row, int col) const;

    Scalar& coeffRef(int row, int col);
    Scalar& operator()(int row, int col);

-    const Scalar coeff(int index) const;
-    const Scalar operator[](int index) const;
-    const Scalar operator()(int index) const;
+    const CoeffReturnType coeff(int index) const;
+    const CoeffReturnType operator[](int index) const;
+    const CoeffReturnType operator()(int index) const;

    Scalar& coeffRef(int index);
    Scalar& operator[](int index);
@@ -292,10 +306,10 @@ template<typename Derived> class MatrixBase
    template<int StoreMode>
    void writePacket(int index, const PacketScalar& x);

-    const Scalar x() const;
-    const Scalar y() const;
-    const Scalar z() const;
-    const Scalar w() const;
+    const CoeffReturnType x() const;
+    const CoeffReturnType y() const;
+    const CoeffReturnType z() const;
+    const CoeffReturnType w() const;
    Scalar& x();
    Scalar& y();
    Scalar& z();
@@ -324,6 +338,9 @@ template<typename Derived> class MatrixBase
    Derived& operator/=(const Scalar& other);

    const ScalarMultipleReturnType operator*(const Scalar& scalar) const;
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    const ScalarMultipleReturnType operator*(const RealScalar& scalar) const;
+    #endif
    const CwiseUnaryOp<ei_scalar_quotient1_op<typename ei_traits<Derived>::Scalar>, Derived>
    operator/(const Scalar& scalar) const;

@@ -351,6 +368,7 @@ template<typename Derived> class MatrixBase
    Scalar dot(const MatrixBase<OtherDerived>& other) const;
    RealScalar squaredNorm() const;
    RealScalar norm()  const;
+    RealScalar stableNorm()  const;
    const PlainMatrixType normalized() const;
    void normalize();

@@ -358,7 +376,7 @@ template<typename Derived> class MatrixBase
    const Eigen::Transpose<Derived> transpose() const;
    void transposeInPlace();
    const AdjointReturnType adjoint() const;
-
+    void adjointInPlace();

    RowXpr row(int i);
    const RowXpr row(int i) const;
@@ -404,9 +422,15 @@ template<typename Derived> class MatrixBase
    template<int Size> typename BlockReturnType<Derived,Size>::SubVectorType segment(int start);
    template<int Size> const typename BlockReturnType<Derived,Size>::SubVectorType segment(int start) const;

-    DiagonalCoeffs<Derived> diagonal();
-    const DiagonalCoeffs<Derived> diagonal() const;
+    Diagonal<Derived,0> diagonal();
+    const Diagonal<Derived,0> diagonal() const;

+    template<int Index> Diagonal<Derived,Index> diagonal();
+    template<int Index> const Diagonal<Derived,Index> diagonal() const;
+    
+    Diagonal<Derived, Dynamic> diagonal(int index);
+    const Diagonal<Derived, Dynamic> diagonal(int index) const;
+    
    template<unsigned int Mode> Part<Derived, Mode> part();
    template<unsigned int Mode> const Part<Derived, Mode> part() const;

@@ -443,7 +467,7 @@ template<typename Derived> class MatrixBase
    static const BasisReturnType UnitZ();
    static const BasisReturnType UnitW();

-    const DiagonalMatrix<Derived> asDiagonal() const;
+    const DiagonalMatrixWrapper<Derived> asDiagonal() const;

    void fill(const Scalar& value);
    Derived& setConstant(const Scalar& value);
@@ -487,8 +511,12 @@ template<typename Derived> class MatrixBase


    template<typename NewType>
-    const CwiseUnaryOp<ei_scalar_cast_op<typename ei_traits<Derived>::Scalar, NewType>, Derived> cast() const;
-
+    typename ei_cast_return_type<
+        Derived,
+        const CwiseUnaryOp<ei_scalar_cast_op<typename ei_traits<Derived>::Scalar, NewType>, Derived>
+      >::type
+    cast() const;
+           
    /** \returns the matrix or vector obtained by evaluating this expression.
      *
      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
@@ -529,6 +557,8 @@ template<typename Derived> class MatrixBase
    Scalar sum() const;
    Scalar trace() const;

+    Scalar prod() const;
+
    typename ei_traits<Derived>::Scalar minCoeff() const;
    typename ei_traits<Derived>::Scalar maxCoeff() const;

@@ -582,9 +612,18 @@ template<typename Derived> class MatrixBase

    template<int p> RealScalar lpNorm() const;

+    template<int RowFactor, int ColFactor>
+    const Replicate<Derived,RowFactor,ColFactor> replicate() const;
+    const Replicate<Derived,Dynamic,Dynamic> replicate(int rowFacor,int colFactor) const;
+
+    Eigen::Reverse<Derived, BothDirections> reverse();
+    const Eigen::Reverse<Derived, BothDirections> reverse() const;
+    void reverseInPlace();
+
 /////////// LU module ///////////

    const LU<PlainMatrixType> lu() const;
+    const PartialLU<PlainMatrixType> partialLu() const;    
    const PlainMatrixType inverse() const;
    void computeInverse(PlainMatrixType *result) const;
    Scalar determinant() const;
@@ -609,8 +648,23 @@ template<typename Derived> class MatrixBase

    template<typename OtherDerived>
    PlainMatrixType cross(const MatrixBase<OtherDerived>& other) const;
+    template<typename OtherDerived>
+    PlainMatrixType cross3(const MatrixBase<OtherDerived>& other) const;
    PlainMatrixType unitOrthogonal(void) const;
    Matrix<Scalar,3,1> eulerAngles(int a0, int a1, int a2) const;
+    const ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
+    enum {
+      SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
+    };
+    typedef Block<Derived,
+                  ei_traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
+                  ei_traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> StartMinusOne;
+    typedef CwiseUnaryOp<ei_scalar_quotient1_op<typename ei_traits<Derived>::Scalar>,
+                NestByValue<StartMinusOne> > HNormalizedReturnType;
+
+    const HNormalizedReturnType hnormalized() const;
+    typedef Homogeneous<Derived,MatrixBase<Derived>::ColsAtCompileTime==1?Vertical:Horizontal> HomogeneousReturnType;
+    const HomogeneousReturnType homogeneous() const;

 /////////// Sparse module ///////////

@@ -621,6 +675,9 @@ template<typename Derived> class MatrixBase
    template<typename Derived1, typename Derived2>
    Derived& lazyAssign(const SparseProduct<Derived1,Derived2,DenseTimeSparseProduct>& product);

+    template<typename OtherDerived,typename OtherEvalType>
+    Derived& operator=(const ReturnByValue<OtherDerived,OtherEvalType>& func);
+
    #ifdef EIGEN_MATRIXBASE_PLUGIN
    #include EIGEN_MATRIXBASE_PLUGIN
    #endif
--- a/Eigen/src/Core/MatrixStorage.h
+++ b/Eigen/src/Core/MatrixStorage.h
@@ -32,7 +32,7 @@ struct ei_constructor_without_unaligned_array_assert {};
  * Static array automatically aligned if the total byte size is a multiple of 16 and the matrix options require auto alignment
  */
 template <typename T, int Size, int MatrixOptions,
-          bool Align = (MatrixOptions&AutoAlign) && (((Size*sizeof(T))&0xf)==0)
+          bool Align = (!(MatrixOptions&DontAlign)) && (((Size*sizeof(T))&0xf)==0)
 > struct ei_matrix_array
 {
  EIGEN_ALIGN_128 T array[Size];
@@ -85,6 +85,21 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class ei_matr
    inline T *data() { return m_data.array; }
 };

+// null matrix
+template<typename T, int _Rows, int _Cols, int _Options> class ei_matrix_storage<T, 0, _Rows, _Cols, _Options>
+{
+  public:
+    inline explicit ei_matrix_storage() {}
+    inline ei_matrix_storage(ei_constructor_without_unaligned_array_assert) {}
+    inline ei_matrix_storage(int,int,int) {}
+    inline void swap(ei_matrix_storage& other) {}
+    inline static int rows(void) {return _Rows;}
+    inline static int cols(void) {return _Cols;}
+    inline void resize(int,int,int) {}
+    inline const T *data() const { return 0; }
+    inline T *data() { return 0; }
+};
+
 // dynamic-size matrix with fixed-size storage
 template<typename T, int Size, int _Options> class ei_matrix_storage<T, Size, Dynamic, Dynamic, _Options>
 {
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -54,7 +54,7 @@ template<typename ExpressionType> class NestByValue
    inline int cols() const { return m_expression.cols(); }
    inline int stride() const { return m_expression.stride(); }

-    inline const Scalar coeff(int row, int col) const
+    inline const CoeffReturnType coeff(int row, int col) const
    {
      return m_expression.coeff(row, col);
    }
@@ -64,7 +64,7 @@ template<typename ExpressionType> class NestByValue
      return m_expression.const_cast_derived().coeffRef(row, col);
    }

-    inline const Scalar coeff(int index) const
+    inline const CoeffReturnType coeff(int index) const
    {
      return m_expression.coeff(index);
    }
@@ -97,6 +97,8 @@ template<typename ExpressionType> class NestByValue
    {
      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
    }
+    
+    operator const ExpressionType&() const { return m_expression; }

  protected:
    const ExpressionType m_expression;
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -84,7 +84,7 @@ template<typename Lhs, typename Rhs> struct ei_product_mode
 {
  enum{

-    value = ((Rhs::Flags&Diagonal)==Diagonal) || ((Lhs::Flags&Diagonal)==Diagonal)
+    value = ei_is_diagonal<Rhs>::ret || ei_is_diagonal<Lhs>::ret
          ? DiagonalProduct
          : Lhs::MaxColsAtCompileTime == Dynamic
            && ( Lhs::MaxRowsAtCompileTime == Dynamic
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -26,48 +26,155 @@
 #ifndef EIGEN_REDUX_H
 #define EIGEN_REDUX_H

-template<typename BinaryOp, typename Derived, int Start, int Length>
-struct ei_redux_impl
+// TODO
+//  * implement other kind of vectorization
+//  * factorize code
+
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for vectorization and unrolling
+***************************************************************************/
+
+template<typename Func, typename Derived>
+struct ei_redux_traits
+{
+private:
+  enum {
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
+    InnerMaxSize = int(Derived::Flags)&RowMajorBit
+                 ? Derived::MaxColsAtCompileTime
+                 : Derived::MaxRowsAtCompileTime
+  };
+
+  enum {
+    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+                  && (ei_functor_traits<Func>::PacketAccess),
+    MayLinearVectorize = MightVectorize && (int(Derived::Flags)&LinearAccessBit),
+    MaySliceVectorize  = MightVectorize && int(InnerMaxSize)>=3*PacketSize 
+  };
+
+public:
+  enum {
+    Vectorization = int(MayLinearVectorize) ? int(LinearVectorization)
+                  : int(MaySliceVectorize)  ? int(SliceVectorization)
+                                            : int(NoVectorization)
+  };
+  
+private:
+  enum {
+    Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
+           + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
+    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
+  };
+
+public:
+  enum {
+    Unrolling = Cost <= UnrollingLimit
+              ? CompleteUnrolling
+              : NoUnrolling
+  };
+};
+
+/***************************************************************************
+* Part 2 : unrollers
+***************************************************************************/
+
+/*** no vectorization ***/
+
+template<typename Func, typename Derived, int Start, int Length>
+struct ei_redux_novec_unroller
 {
  enum {
    HalfLength = Length/2
  };

-  typedef typename ei_result_of<BinaryOp(typename Derived::Scalar)>::type Scalar;
+  typedef typename Derived::Scalar Scalar;

-  static Scalar run(const Derived &mat, const BinaryOp& func)
+  EIGEN_STRONG_INLINE static Scalar run(const Derived &mat, const Func& func)
  {
-    return func(
-      ei_redux_impl<BinaryOp, Derived, Start, HalfLength>::run(mat, func),
-      ei_redux_impl<BinaryOp, Derived, Start+HalfLength, Length - HalfLength>::run(mat, func));
+    return func(ei_redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+                ei_redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
  }
 };

-template<typename BinaryOp, typename Derived, int Start>
-struct ei_redux_impl<BinaryOp, Derived, Start, 1>
+template<typename Func, typename Derived, int Start>
+struct ei_redux_novec_unroller<Func, Derived, Start, 1>
 {
  enum {
    col = Start / Derived::RowsAtCompileTime,
    row = Start % Derived::RowsAtCompileTime
  };

-  typedef typename ei_result_of<BinaryOp(typename Derived::Scalar)>::type Scalar;
+  typedef typename Derived::Scalar Scalar;

-  static Scalar run(const Derived &mat, const BinaryOp &)
+  EIGEN_STRONG_INLINE static Scalar run(const Derived &mat, const Func&)
  {
    return mat.coeff(row, col);
  }
 };

-template<typename BinaryOp, typename Derived, int Start>
-struct ei_redux_impl<BinaryOp, Derived, Start, Dynamic>
+/*** vectorization ***/
+  
+template<typename Func, typename Derived, int Start, int Length>
+struct ei_redux_vec_unroller
 {
-  typedef typename ei_result_of<BinaryOp(typename Derived::Scalar)>::type Scalar;
-  static Scalar run(const Derived& mat, const BinaryOp& func)
+  enum {
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
+    HalfLength = Length/2
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  EIGEN_STRONG_INLINE static PacketScalar run(const Derived &mat, const Func& func)
+  {
+    return func.packetOp(
+            ei_redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+            ei_redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+  }
+};
+
+template<typename Func, typename Derived, int Start>
+struct ei_redux_vec_unroller<Func, Derived, Start, 1>
+{
+  enum {
+    index = Start * ei_packet_traits<typename Derived::Scalar>::size,
+    row = int(Derived::Flags)&RowMajorBit
+        ? index / int(Derived::ColsAtCompileTime)
+        : index % Derived::RowsAtCompileTime,
+    col = int(Derived::Flags)&RowMajorBit
+        ? index % int(Derived::ColsAtCompileTime)
+        : index / Derived::RowsAtCompileTime,
+    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  EIGEN_STRONG_INLINE static PacketScalar run(const Derived &mat, const Func&)
+  {
+    return mat.template packet<alignment>(row, col);
+  }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+template<typename Func, typename Derived,
+         int Vectorization = ei_redux_traits<Func, Derived>::Vectorization,
+         int Unrolling = ei_redux_traits<Func, Derived>::Unrolling
+>
+struct ei_redux_impl;
+
+template<typename Func, typename Derived>
+struct ei_redux_impl<Func, Derived, NoVectorization, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  static Scalar run(const Derived& mat, const Func& func)
  {
    ei_assert(mat.rows()>0 && mat.cols()>0 && "you are using a non initialized matrix");
    Scalar res;
-    res = mat.coeff(0,0);
+    res = mat.coeff(0, 0);
    for(int i = 1; i < mat.rows(); ++i)
      res = func(res, mat.coeff(i, 0));
    for(int j = 1; j < mat.cols(); ++j)
@@ -77,6 +184,116 @@ struct ei_redux_impl<BinaryOp, Derived, Start, Dynamic>
  }
 };

+template<typename Func, typename Derived>
+struct ei_redux_impl<Func,Derived, NoVectorization, CompleteUnrolling>
+  : public ei_redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
+{};
+
+template<typename Func, typename Derived>
+struct ei_redux_impl<Func, Derived, LinearVectorization, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  static Scalar run(const Derived& mat, const Func& func)
+  {
+    const int size = mat.size();
+    const int packetSize = ei_packet_traits<Scalar>::size;
+    const int alignedStart =  (Derived::Flags & AlignedBit)
+                           || !(Derived::Flags & DirectAccessBit)
+                           ? 0
+                           : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
+    enum {
+      alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
+                ? Aligned : Unaligned
+    };
+    const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
+    const int alignedEnd = alignedStart + alignedSize;
+    Scalar res;
+    if(alignedSize)
+    {
+      PacketScalar packet_res = mat.template packet<alignment>(alignedStart);
+      for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
+        packet_res = func.packetOp(packet_res, mat.template packet<alignment>(index));
+      res = func.predux(packet_res);
+      
+      for(int index = 0; index < alignedStart; ++index)
+        res = func(res,mat.coeff(index));
+
+      for(int index = alignedEnd; index < size; ++index)
+        res = func(res,mat.coeff(index));
+    }
+    else // too small to vectorize anything.
+         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+    {
+      res = mat.coeff(0);
+      for(int index = 1; index < size; ++index)
+        res = func(res,mat.coeff(index));
+    }
+
+    return res;
+  }
+};
+
+template<typename Func, typename Derived>
+struct ei_redux_impl<Func, Derived, SliceVectorization, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  static Scalar run(const Derived& mat, const Func& func)
+  {
+    const int innerSize = mat.innerSize();
+    const int outerSize = mat.outerSize();
+    enum {
+      packetSize = ei_packet_traits<Scalar>::size,
+      isRowMajor = Derived::Flags&RowMajorBit?1:0
+    };
+    const int packetedInnerSize = ((innerSize)/packetSize)*packetSize;
+    Scalar res;
+    if(packetedInnerSize)
+    {
+      PacketScalar packet_res = mat.template packet<Unaligned>(0,0);
+      for(int j=0; j<outerSize; ++j)
+        for(int i=0; i<packetedInnerSize; i+=int(packetSize))
+          packet_res = func.packetOp(packet_res, mat.template packet<Unaligned>
+                                                 (isRowMajor?j:i, isRowMajor?i:j));
+      
+      res = func.predux(packet_res);
+      for(int j=0; j<outerSize; ++j)
+        for(int i=packetedInnerSize; i<innerSize; ++i)
+          res = func(res, mat.coeff(isRowMajor?j:i, isRowMajor?i:j));
+    }
+    else // too small to vectorize anything.
+         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+    {
+      res = ei_redux_impl<Func, Derived, NoVectorization, NoUnrolling>::run(mat, func);
+    }
+
+    return res;
+  }
+};
+
+template<typename Func, typename Derived>
+struct ei_redux_impl<Func, Derived, LinearVectorization, CompleteUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+  enum {
+    PacketSize = ei_packet_traits<Scalar>::size,
+    Size = Derived::SizeAtCompileTime,
+    VectorizationSize = (Size / PacketSize) * PacketSize
+  };
+  EIGEN_STRONG_INLINE static Scalar run(const Derived& mat, const Func& func)
+  {
+    Scalar res = func.predux(ei_redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+    if (VectorizationSize != Size)
+      res = func(res,ei_redux_novec_unroller<Func, Derived, VectorizationSize, Size-VectorizationSize>::run(mat,func));
+    return res;
+  }
+};
+
+
 /** \returns the result of a full redux operation on the whole matrix or vector using \a func
  *
  * The template parameter \a BinaryOp is the type of the functor \a func which must be
@@ -85,21 +302,20 @@ struct ei_redux_impl<BinaryOp, Derived, Start, Dynamic>
  * \sa MatrixBase::sum(), MatrixBase::minCoeff(), MatrixBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
  */
 template<typename Derived>
-template<typename BinaryOp>
-typename ei_result_of<BinaryOp(typename ei_traits<Derived>::Scalar)>::type
-MatrixBase<Derived>::redux(const BinaryOp& func) const
+template<typename Func>
+inline typename ei_result_of<Func(typename ei_traits<Derived>::Scalar)>::type
+MatrixBase<Derived>::redux(const Func& func) const
 {
-  const bool unroll = SizeAtCompileTime * CoeffReadCost
-                    + (SizeAtCompileTime-1) * ei_functor_traits<BinaryOp>::Cost
-                    <= EIGEN_UNROLLING_LIMIT;
-  return ei_redux_impl<BinaryOp, Derived, 0, unroll ? int(SizeAtCompileTime) : Dynamic>
-            ::run(derived(), func);
+  typename Derived::Nested nested(derived());
+  typedef typename ei_cleantype<typename Derived::Nested>::type ThisNested;
+  return ei_redux_impl<Func, ThisNested>
+            ::run(nested, func);
 }

 /** \returns the minimum of all coefficients of *this
  */
 template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
 MatrixBase<Derived>::minCoeff() const
 {
  return this->redux(Eigen::ei_scalar_min_op<Scalar>());
@@ -108,10 +324,48 @@ MatrixBase<Derived>::minCoeff() const
 /** \returns the maximum of all coefficients of *this
  */
 template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
 MatrixBase<Derived>::maxCoeff() const
 {
  return this->redux(Eigen::ei_scalar_max_op<Scalar>());
 }

+/** \returns the sum of all coefficients of *this
+  *
+  * \sa trace(), prod()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::sum() const
+{
+  return this->redux(Eigen::ei_scalar_sum_op<Scalar>());
+}
+
+/** \returns the product of all coefficients of *this
+  *
+  * Example: \include MatrixBase_prod.cpp
+  * Output: \verbinclude MatrixBase_prod.out
+  *
+  * \sa sum()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::prod() const
+{
+  return this->redux(Eigen::ei_scalar_product_op<Scalar>());
+}
+
+/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
+  *
+  * \c *this can be any matrix, not necessarily square.
+  *
+  * \sa diagonal(), sum()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::trace() const
+{
+  return diagonal().sum();
+}
+
 #endif // EIGEN_REDUX_H
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -0,0 +1,73 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_RETURNBYVALUE_H
+#define EIGEN_RETURNBYVALUE_H
+
+/** \class ReturnByValue
+  *
+  */
+template<typename Functor, typename _Scalar,int _Rows,int _Cols,int _Options,int _MaxRows,int _MaxCols>
+struct ei_traits<ReturnByValue<Functor,Matrix<_Scalar,_Rows,_Cols,_Options,_MaxRows,_MaxCols> > >
+  : public ei_traits<Matrix<_Scalar,_Rows,_Cols,_Options,_MaxRows,_MaxCols> >
+{
+  enum {
+    Flags = ei_traits<Matrix<_Scalar,_Rows,_Cols,_Options,_MaxRows,_MaxCols> >::Flags | EvalBeforeNestingBit
+  };
+};
+
+template<typename Functor,typename EvalTypeDerived,int n>
+struct ei_nested<ReturnByValue<Functor,MatrixBase<EvalTypeDerived> >, n, EvalTypeDerived>
+{
+  typedef EvalTypeDerived type;
+};
+
+template<typename Functor, typename EvalType> class ReturnByValue
+{
+  public:
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const
+    { static_cast<const Functor*>(this)->evalTo(dst); }
+};
+
+template<typename Functor, typename _Scalar,int _Rows,int _Cols,int _Options,int _MaxRows,int _MaxCols>
+  class ReturnByValue<Functor,Matrix<_Scalar,_Rows,_Cols,_Options,_MaxRows,_MaxCols> >
+  : public MatrixBase<ReturnByValue<Functor,Matrix<_Scalar,_Rows,_Cols,_Options,_MaxRows,_MaxCols> > >
+{
+  public:
+    EIGEN_GENERIC_PUBLIC_INTERFACE(ReturnByValue)
+    template<typename Dest>
+    inline void evalTo(Dest& dst) const
+    { static_cast<const Functor* const>(this)->evalTo(dst); }
+};
+
+template<typename Derived>
+template<typename OtherDerived,typename OtherEvalType>
+Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived,OtherEvalType>& other)
+{
+  other.evalTo(derived());
+  return derived();
+}
+
+#endif // EIGEN_RETURNBYVALUE_H
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -33,8 +33,8 @@ template<typename Lhs, typename Rhs,
                     ? LowerTriangular
                     : (int(Lhs::Flags) & UpperTriangularBit)
                     ? UpperTriangular
-                     : -1,
-  int StorageOrder = ei_is_part<Lhs>::value ? -1  // this is to solve ambiguous specializations
+                     : 0xffffff,
+  int StorageOrder = ei_is_part<Lhs>::value ? 0xffffff  // this is to solve ambiguous specializations
                   : int(Lhs::Flags) & (RowMajorBit|SparseBit)
  >
 struct ei_solve_triangular_selector;
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@@ -1,271 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-
-#ifndef EIGEN_SUM_H
-#define EIGEN_SUM_H
-
-/***************************************************************************
-* Part 1 : the logic deciding a strategy for vectorization and unrolling
-***************************************************************************/
-
-template<typename Derived>
-struct ei_sum_traits
-{
-private:
-  enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size
-  };
-
-public:
-  enum {
-    Vectorization = (int(Derived::Flags)&ActualPacketAccessBit)
-                 && (int(Derived::Flags)&LinearAccessBit)
-                  ? LinearVectorization
-                  : NoVectorization
-  };
-
-private:
-  enum {
-    Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
-           + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
-    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
-  };
-
-public:
-  enum {
-    Unrolling = Cost <= UnrollingLimit
-              ? CompleteUnrolling
-              : NoUnrolling
-  };
-};
-
-/***************************************************************************
-* Part 2 : unrollers
-***************************************************************************/
-
-/*** no vectorization ***/
-
-template<typename Derived, int Start, int Length>
-struct ei_sum_novec_unroller
-{
-  enum {
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-
-  inline static Scalar run(const Derived &mat)
-  {
-    return ei_sum_novec_unroller<Derived, Start, HalfLength>::run(mat)
-         + ei_sum_novec_unroller<Derived, Start+HalfLength, Length-HalfLength>::run(mat);
-  }
-};
-
-template<typename Derived, int Start>
-struct ei_sum_novec_unroller<Derived, Start, 1>
-{
-  enum {
-    col = Start / Derived::RowsAtCompileTime,
-    row = Start % Derived::RowsAtCompileTime
-  };
-
-  typedef typename Derived::Scalar Scalar;
-
-  inline static Scalar run(const Derived &mat)
-  {
-    return mat.coeff(row, col);
-  }
-};
-
-/*** vectorization ***/
-  
-template<typename Derived, int Start, int Length>
-struct ei_sum_vec_unroller
-{
-  enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  inline static PacketScalar run(const Derived &mat)
-  {
-    return ei_padd(
-            ei_sum_vec_unroller<Derived, Start, HalfLength>::run(mat),
-            ei_sum_vec_unroller<Derived, Start+HalfLength, Length-HalfLength>::run(mat) );
-  }
-};
-
-template<typename Derived, int Start>
-struct ei_sum_vec_unroller<Derived, Start, 1>
-{
-  enum {
-    index = Start * ei_packet_traits<typename Derived::Scalar>::size,
-    row = int(Derived::Flags)&RowMajorBit
-        ? index / int(Derived::ColsAtCompileTime)
-        : index % Derived::RowsAtCompileTime,
-    col = int(Derived::Flags)&RowMajorBit
-        ? index % int(Derived::ColsAtCompileTime)
-        : index / Derived::RowsAtCompileTime,
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  inline static PacketScalar run(const Derived &mat)
-  {
-    return mat.template packet<alignment>(row, col);
-  }
-};
-
-/***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
-
-template<typename Derived,
-         int Vectorization = ei_sum_traits<Derived>::Vectorization,
-         int Unrolling = ei_sum_traits<Derived>::Unrolling
->
-struct ei_sum_impl;
-
-template<typename Derived>
-struct ei_sum_impl<Derived, NoVectorization, NoUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  static Scalar run(const Derived& mat)
-  {
-    ei_assert(mat.rows()>0 && mat.cols()>0 && "you are using a non initialized matrix");
-    Scalar res;
-    res = mat.coeff(0, 0);
-    for(int i = 1; i < mat.rows(); ++i)
-      res += mat.coeff(i, 0);
-    for(int j = 1; j < mat.cols(); ++j)
-      for(int i = 0; i < mat.rows(); ++i)
-        res += mat.coeff(i, j);
-    return res;
-  }
-};
-
-template<typename Derived>
-struct ei_sum_impl<Derived, NoVectorization, CompleteUnrolling>
-  : public ei_sum_novec_unroller<Derived, 0, Derived::SizeAtCompileTime>
-{};
-
-template<typename Derived>
-struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  static Scalar run(const Derived& mat)
-  {
-    const int size = mat.size();
-    const int packetSize = ei_packet_traits<Scalar>::size;
-    const int alignedStart =  (Derived::Flags & AlignedBit)
-                           || !(Derived::Flags & DirectAccessBit)
-                           ? 0
-                           : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
-    enum {
-      alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
-    };
-    const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
-    const int alignedEnd = alignedStart + alignedSize;
-    Scalar res;
-
-    if(alignedSize)
-    {
-      PacketScalar packet_res = mat.template packet<alignment>(alignedStart);
-      for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
-        packet_res = ei_padd(packet_res, mat.template packet<alignment>(index));
-      res = ei_predux(packet_res);
-    }
-    else // too small to vectorize anything.
-         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
-    {
-      res = Scalar(0);
-    }
-
-    for(int index = 0; index < alignedStart; ++index)
-      res += mat.coeff(index);
-
-    for(int index = alignedEnd; index < size; ++index)
-      res += mat.coeff(index);
-
-    return res;
-  }
-};
-
-template<typename Derived>
-struct ei_sum_impl<Derived, LinearVectorization, CompleteUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-  enum {
-    PacketSize = ei_packet_traits<Scalar>::size,
-    Size = Derived::SizeAtCompileTime,
-    VectorizationSize = (Size / PacketSize) * PacketSize
-  };
-  static Scalar run(const Derived& mat)
-  {
-    Scalar res = ei_predux(ei_sum_vec_unroller<Derived, 0, Size / PacketSize>::run(mat));
-    if (VectorizationSize != Size)
-      res += ei_sum_novec_unroller<Derived, VectorizationSize, Size-VectorizationSize>::run(mat);
-    return res;
-  }
-};
-
-/***************************************************************************
-* Part 4 : implementation of MatrixBase methods
-***************************************************************************/
-
-/** \returns the sum of all coefficients of *this
-  *
-  * \sa trace()
-  */
-template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
-MatrixBase<Derived>::sum() const
-{
-  return ei_sum_impl<Derived>::run(derived());
-}
-
-/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
-  *
-  * \c *this can be any matrix, not necessarily square.
-  *
-  * \sa diagonal(), sum()
-  */
-template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
-MatrixBase<Derived>::trace() const
-{
-  return diagonal().sum();
-}
-
-#endif // EIGEN_SUM_H
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -77,21 +77,21 @@ template<typename MatrixType> class Transpose
      return m_matrix.const_cast_derived().coeffRef(col, row);
    }

-    inline const Scalar coeff(int row, int col) const
+    inline Scalar& coeffRef(int index)
+    {
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+
+    inline const CoeffReturnType coeff(int row, int col) const
    {
      return m_matrix.coeff(col, row);
    }

-    inline const Scalar coeff(int index) const
+    inline const CoeffReturnType coeff(int index) const
    {
      return m_matrix.coeff(index);
    }

-    inline Scalar& coeffRef(int index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
    template<int LoadMode>
    inline const PacketScalar packet(int row, int col) const
    {
@@ -125,7 +125,20 @@ template<typename MatrixType> class Transpose
  * Example: \include MatrixBase_transpose.cpp
  * Output: \verbinclude MatrixBase_transpose.out
  *
-  * \sa adjoint(), class DiagonalCoeffs */
+  * \warning If you want to replace a matrix by its own transpose, do \b NOT do this:
+  * \code
+  * m = m.transpose(); // bug!!! caused by aliasing effect
+  * \endcode
+  * Instead, use the transposeInPlace() method:
+  * \code
+  * m.transposeInPlace();
+  * \endcode
+  * which gives Eigen good opportunities for optimization, or alternatively you can also do:
+  * \code
+  * m = m.transpose().eval();
+  * \endcode
+  *
+  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
 inline Transpose<Derived>
 MatrixBase<Derived>::transpose()
@@ -133,7 +146,11 @@ MatrixBase<Derived>::transpose()
  return derived();
 }

-/** This is the const version of transpose(). \sa adjoint() */
+/** This is the const version of transpose().
+  *
+  * Make sure you read the warning for transpose() !
+  *
+  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
 inline const Transpose<Derived>
 MatrixBase<Derived>::transpose() const
@@ -146,7 +163,20 @@ MatrixBase<Derived>::transpose() const
  * Example: \include MatrixBase_adjoint.cpp
  * Output: \verbinclude MatrixBase_adjoint.out
  *
-  * \sa transpose(), conjugate(), class Transpose, class ei_scalar_conjugate_op */
+  * \warning If you want to replace a matrix by its own adjoint, do \b NOT do this:
+  * \code
+  * m = m.adjoint(); // bug!!! caused by aliasing effect
+  * \endcode
+  * Instead, use the adjointInPlace() method:
+  * \code
+  * m.adjointInPlace();
+  * \endcode
+  * which gives Eigen good opportunities for optimization, or alternatively you can also do:
+  * \code
+  * m = m.adjoint().eval();
+  * \endcode
+  *
+  * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class ei_scalar_conjugate_op */
 template<typename Derived>
 inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
@@ -179,24 +209,56 @@ struct ei_inplace_transpose_selector<MatrixType,false> { // non square matrix
  }
 };

-/** This is the "in place" version of transpose: it transposes \c *this.
+/** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
+  * Thus, doing
+  * \code
+  * m.transposeInPlace();
+  * \endcode
+  * has the same effect on m as doing
+  * \code
+  * m = m.transpose().eval();
+  * \endcode
+  * and is faster and also safer because in the latter line of code, forgetting the eval() results
+  * in a bug caused by aliasing.
  *
-  * In most cases it is probably better to simply use the transposed expression
-  * of a matrix. However, when transposing the matrix data itself is really needed,
-  * then this "in-place" version is probably the right choice because it provides
-  * the following additional features:
-  *  - less error prone: doing the same operation with .transpose() requires special care:
-  *    \code m = m.transpose().eval(); \endcode
-  *  - no temporary object is created (currently only for squared matrices)
-  *  - it allows future optimizations (cache friendliness, etc.)
+  * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
+  * If you just need the transpose of a matrix, use transpose().
  *
  * \note if the matrix is not square, then \c *this must be a resizable matrix.
  *
-  * \sa transpose(), adjoint() */
+  * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
 inline void MatrixBase<Derived>::transposeInPlace()
 {
  ei_inplace_transpose_selector<Derived>::run(derived());
 }

+/***************************************************************************
+* "in place" adjoint implementation
+***************************************************************************/
+
+/** This is the "in place" version of adjoint(): it replaces \c *this by its own transpose.
+  * Thus, doing
+  * \code
+  * m.adjointInPlace();
+  * \endcode
+  * has the same effect on m as doing
+  * \code
+  * m = m.adjoint().eval();
+  * \endcode
+  * and is faster and also safer because in the latter line of code, forgetting the eval() results
+  * in a bug caused by aliasing.
+  *
+  * Notice however that this method is only useful if you want to replace a matrix by its own adjoint.
+  * If you just need the adjoint of a matrix, use adjoint().
+  *
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
+  *
+  * \sa transpose(), adjoint(), transposeInPlace() */
+template<typename Derived>
+inline void MatrixBase<Derived>::adjointInPlace()
+{
+  derived() = adjoint().eval();
+}
+
 #endif // EIGEN_TRANSPOSE_H
--- a/Eigen/src/Core/arch/AltiVec/CMakeLists.txt
+++ b/Eigen/src/Core/arch/AltiVec/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_Core_arch_AltiVec_SRCS "*.h")

 INSTALL(FILES
  ${Eigen_Core_arch_AltiVec_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AltiVec
-)
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AltiVec COMPONENT Devel
+)
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -45,8 +45,10 @@ typedef __vector __bool int     v4bi;
 #define USE_CONST_v1i_    const v4ui  v1i_  = vec_splat_u32(-1)
 #define USE_CONST_v0f_    USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_)

-template<> struct ei_packet_traits<float>  { typedef v4f type; enum {size=4}; };
-template<> struct ei_packet_traits<int>    { typedef v4i type; enum {size=4}; };
+template<> struct ei_packet_traits<float> : ei_default_packet_traits
+{ typedef v4f type; enum {size=4}; };
+template<> struct ei_packet_traits<int>   : ei_default_packet_traits
+{ typedef v4i type; enum {size=4}; };

 template<> struct ei_unpacket_traits<v4f>  { typedef float  type; enum {size=4}; };
 template<> struct ei_unpacket_traits<v4i>  { typedef int    type; enum {size=4}; };
@@ -101,6 +103,18 @@ template<> inline v4i  ei_padd(const v4i&   a, const v4i&   b) { return vec_add(
 template<> inline v4f  ei_psub(const v4f&   a, const v4f&   b) { return vec_sub(a,b); }
 template<> inline v4i  ei_psub(const v4i&   a, const v4i&   b) { return vec_sub(a,b); }

+template<> EIGEN_STRONG_INLINE v4f ei_pnegate(const v4f& a)
+{
+  v4i mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+  return vec_xor(a,(v4f) mask);
+}
+
+template<> EIGEN_STRONG_INLINE v4i ei_pnegate(const v4i& a)
+{
+  USE_CONST_v0i;
+  return ei_psub(v0i, a);
+}
+
 template<> inline v4f  ei_pmul(const v4f&   a, const v4f&   b) { USE_CONST_v0f; return vec_madd(a,b, v0f); }
 template<> inline v4i  ei_pmul(const v4i&   a, const v4i&   b)
 {
@@ -111,7 +125,7 @@ template<> inline v4i  ei_pmul(const v4i&   a, const v4i&   b)
  USE_CONST_v1i;
  USE_CONST_v16i_;

-  // Get the absolute values 
+  // Get the absolute values
  a1  = vec_abs(a);
  b1  = vec_abs(b);

@@ -146,7 +160,7 @@ template<> inline v4f  ei_pdiv(const v4f&   a, const v4f&   b) {

  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
  y_0 = vec_re(b);
-  
+
  // Do one Newton-Raphson iteration to get the needed accuracy
  t = vec_nmsub(y_0, b, v1f);
  y_1 = vec_madd(y_0, t, y_0);
@@ -163,6 +177,9 @@ template<> inline v4i  ei_pmin(const v4i&   a, const v4i&   b) { return vec_min(
 template<> inline v4f  ei_pmax(const v4f&   a, const v4f&   b) { return vec_max(a,b); }
 template<> inline v4i  ei_pmax(const v4i&   a, const v4i&   b) { return vec_max(a,b); }

+template<> EIGEN_STRONG_INLINE v4f ei_pabs(const v4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE v4i ei_pabs(const v4i& a) { return vec_abs(a); }
+
 template<> inline v4f  ei_pload(const float* from) { return vec_ld(0, from); }
 template<> inline v4i  ei_pload(const int*   from) { return vec_ld(0, from); }

@@ -177,7 +194,7 @@ template<> inline v4f  ei_ploadu(const float*  from)
  return (v4f) vec_perm(MSQ, LSQ, mask);           // align the data
 }

-template<> inline v4i    ei_ploadu(const int*    from)
+template<> inline v4i  ei_ploadu(const int*    from)
 {
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  __vector unsigned char MSQ, LSQ;
@@ -198,7 +215,7 @@ template<> inline v4f  ei_pset1(const float&  from)
  return vc;
 }

-template<> inline v4i    ei_pset1(const int&    from)
+template<> inline v4i  ei_pset1(const int&    from)
 {
  int __attribute__(aligned(16)) ai[4];
  ai[0] = from;
@@ -248,18 +265,31 @@ template<> inline void ei_pstoreu(int*    to , const v4i&    from )

 template<> inline float  ei_pfirst(const v4f&  a)
 {
-  float __attribute__(aligned(16)) af[4];
+  float EIGEN_ALIGN_128 af[4];
  vec_st(a, 0, af);
  return af[0];
 }

 template<> inline int    ei_pfirst(const v4i&  a)
 {
-  int __attribute__(aligned(16)) ai[4];
+  int EIGEN_ALIGN_128 ai[4];
  vec_st(a, 0, ai);
  return ai[0];
 }

+template<> EIGEN_STRONG_INLINE v4f ei_preverse(const v4f& a)
+{
+  static const __vector unsigned char reverse_mask =
+    {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+  return (v4f)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask);
+}
+template<> EIGEN_STRONG_INLINE v4i ei_preverse(const v4i& a)
+{
+  static const __vector unsigned char __attribute__(aligned(16)) reverse_mask =
+    {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+  return (v4i)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask);
+}
+
 inline v4f ei_preduxp(const v4f* vecs)
 {
  v4f v[4], sum[4];
@@ -287,6 +317,33 @@ inline v4f ei_preduxp(const v4f* vecs)
  return sum[0];
 }

+inline v4i  ei_preduxp(const v4i* vecs)
+{
+  v4i v[4], sum[4];
+
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
+
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = vec_add(sum[0], sum[1]);
+  // Lines 2+3
+  sum[1] = vec_add(sum[2], sum[3]);
+  // Add the results
+  sum[0] = vec_add(sum[0], sum[1]);
+  return sum[0];
+}
+
 inline float ei_predux(const v4f& a)
 {
  v4f b, sum;
@@ -297,33 +354,6 @@ inline float ei_predux(const v4f& a)
  return ei_pfirst(sum);
 }

-inline v4i  ei_preduxp(const v4i* vecs)
-{
-  v4i v[4], sum[4];
-
-  // It's easier and faster to transpose then add as columns
-  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
-  // Do the transpose, first set of moves
-  v[0] = vec_mergeh(vecs[0], vecs[2]);
-  v[1] = vec_mergel(vecs[0], vecs[2]);
-  v[2] = vec_mergeh(vecs[1], vecs[3]);
-  v[3] = vec_mergel(vecs[1], vecs[3]);
-  // Get the resulting vectors
-  sum[0] = vec_mergeh(v[0], v[2]);
-  sum[1] = vec_mergel(v[0], v[2]);
-  sum[2] = vec_mergeh(v[1], v[3]);
-  sum[3] = vec_mergel(v[1], v[3]);
-
-  // Now do the summation:
-  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
-  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
-  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
-  return sum[0];
-}
-
 inline int ei_predux(const v4i& a)
 {
  USE_CONST_v0i;
@@ -333,6 +363,55 @@ inline int ei_predux(const v4i& a)
  return ei_pfirst(sum);
 }

+// implement other reductions operators
+inline float ei_predux_mul(const v4f& a)
+{
+  v4f prod;
+  prod = ei_pmul(a, (v4f)vec_sld(a, a, 8));
+  return ei_pfirst(ei_pmul(prod, (v4f)vec_sld(prod, prod, 4)));
+}
+
+inline int ei_predux_mul(const v4i& a)
+{
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+inline float ei_predux_min(const v4f& a)
+{
+  v4f b, res;
+  b = vec_min(a, vec_sld(a, a, 8));
+  res = vec_min(b, vec_sld(b, b, 4));
+  return ei_pfirst(res);
+}
+
+inline int ei_predux_min(const v4i& a)
+{
+  v4i b, res;
+  b = vec_min(a, vec_sld(a, a, 8));
+  res = vec_min(b, vec_sld(b, b, 4));
+  return ei_pfirst(res);
+}
+
+inline float ei_predux_max(const v4f& a)
+{
+  v4f b, res;
+  b = vec_max(a, vec_sld(a, a, 8));
+  res = vec_max(b, vec_sld(b, b, 4));
+  return ei_pfirst(res);
+}
+
+inline int ei_predux_max(const v4i& a)
+{
+  v4i b, res;
+  b = vec_max(a, vec_sld(a, a, 8));
+  res = vec_max(b, vec_sld(b, b, 4));
+  return ei_pfirst(res);
+}
+
+
+
 template<int Offset>
 struct ei_palign_impl<Offset, v4f>
 {
--- a/Eigen/src/Core/arch/SSE/CMakeLists.txt
+++ b/Eigen/src/Core/arch/SSE/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h")

 INSTALL(FILES
  ${Eigen_Core_arch_SSE_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE
-)
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel
+)
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -0,0 +1,376 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
+#define EIGEN_MATH_FUNCTIONS_SSE_H
+
+static EIGEN_DONT_INLINE EIGEN_UNUSED Packet4f ei_plog(Packet4f x)
+{
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+  /* the smallest non denormalized float number */
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+
+  /* natural logarithm computed for 4 simultaneous float
+    return NaN for x <= 0
+  */
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+
+  Packet4i emm0;
+
+  Packet4f invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+
+  x = ei_pmax(x, ei_p4f_min_norm_pos);  /* cut off denormalized stuff */
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+
+  /* keep only the fractional part */
+  x = _mm_and_ps(x, ei_p4f_inv_mant_mask);
+  x = _mm_or_ps(x, ei_p4f_half);
+
+  emm0 = _mm_sub_epi32(emm0, ei_p4i_0x7f);
+  Packet4f e = ei_padd(_mm_cvtepi32_ps(emm0), ei_p4f_1);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  Packet4f mask = _mm_cmplt_ps(x, ei_p4f_cephes_SQRTHF);
+  Packet4f tmp = _mm_and_ps(x, mask);
+  x = ei_psub(x, ei_p4f_1);
+  e = ei_psub(e, _mm_and_ps(ei_p4f_1, mask));
+  x = ei_padd(x, tmp);
+
+  Packet4f x2 = ei_pmul(x,x);
+  Packet4f x3 = ei_pmul(x2,x);
+
+  Packet4f y, y1, y2;
+  y  = ei_pmadd(ei_p4f_cephes_log_p0, x, ei_p4f_cephes_log_p1);
+  y1 = ei_pmadd(ei_p4f_cephes_log_p3, x, ei_p4f_cephes_log_p4);
+  y2 = ei_pmadd(ei_p4f_cephes_log_p6, x, ei_p4f_cephes_log_p7);
+  y  = ei_pmadd(y , x, ei_p4f_cephes_log_p2);
+  y1 = ei_pmadd(y1, x, ei_p4f_cephes_log_p5);
+  y2 = ei_pmadd(y2, x, ei_p4f_cephes_log_p8);
+  y = ei_pmadd(y, x3, y1);
+  y = ei_pmadd(y, x3, y2);
+  y = ei_pmul(y, x3);
+
+  y1 = ei_pmul(e, ei_p4f_cephes_log_q1);
+  tmp = ei_pmul(x2, ei_p4f_half);
+  y = ei_padd(y, y1);
+  x = ei_psub(x, tmp);
+  y2 = ei_pmul(e, ei_p4f_cephes_log_q2);
+  x = ei_padd(x, y);
+  x = ei_padd(x, y2);
+  return _mm_or_ps(x, invalid_mask); // negative arg will be NAN
+}
+
+static EIGEN_DONT_INLINE EIGEN_UNUSED Packet4f ei_pexp(Packet4f x)
+{
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647949f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+  Packet4f tmp = _mm_setzero_ps(), fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = ei_pmax(ei_pmin(x, ei_p4f_exp_hi), ei_p4f_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = ei_pmadd(x, ei_p4f_cephes_LOG2EF, ei_p4f_half);
+
+  /* how to perform a floorf with SSE: just below */
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+  /* if greater, substract 1 */
+  Packet4f mask = _mm_cmpgt_ps(tmp, fx);
+  mask = _mm_and_ps(mask, ei_p4f_1);
+  fx = ei_psub(tmp, mask);
+
+  tmp = ei_pmul(fx, ei_p4f_cephes_exp_C1);
+  Packet4f z = ei_pmul(fx, ei_p4f_cephes_exp_C2);
+  x = ei_psub(x, tmp);
+  x = ei_psub(x, z);
+
+  z = ei_pmul(x,x);
+
+  Packet4f y = ei_p4f_cephes_exp_p0;
+  y = ei_pmadd(y, x, ei_p4f_cephes_exp_p1);
+  y = ei_pmadd(y, x, ei_p4f_cephes_exp_p2);
+  y = ei_pmadd(y, x, ei_p4f_cephes_exp_p3);
+  y = ei_pmadd(y, x, ei_p4f_cephes_exp_p4);
+  y = ei_pmadd(y, x, ei_p4f_cephes_exp_p5);
+  y = ei_pmadd(y, z, x);
+  y = ei_padd(y, ei_p4f_1);
+
+  /* build 2^n */
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, ei_p4i_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  return ei_pmul(y, _mm_castsi128_ps(emm0));
+}
+
+/* evaluation of 4 sines at onces, using SSE2 intrinsics.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+*/
+
+static EIGEN_DONT_INLINE EIGEN_UNUSED Packet4f ei_psin(Packet4f x)
+{
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+
+  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
+  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
+  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
+
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
+
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+
+  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+  Packet4i emm0, emm2;
+  sign_bit = x;
+  /* take the absolute value */
+  x = ei_pabs(x);
+
+  /* take the modulo */
+
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, ei_p4f_sign_mask);
+
+  /* scale by 4/Pi */
+  y = ei_pmul(x, ei_p4f_cephes_FOPI);
+
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, ei_p4i_1);
+  emm2 = _mm_and_si128(emm2, ei_p4i_not1);
+  y = _mm_cvtepi32_ps(emm2);
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, ei_p4i_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, ei_p4i_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
+  Packet4f poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = ei_pmul(y, ei_p4f_minus_cephes_DP1);
+  xmm2 = ei_pmul(y, ei_p4f_minus_cephes_DP2);
+  xmm3 = ei_pmul(y, ei_p4f_minus_cephes_DP3);
+  x = ei_padd(x, xmm1);
+  x = ei_padd(x, xmm2);
+  x = ei_padd(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = ei_p4f_coscof_p0;
+  Packet4f z = _mm_mul_ps(x,x);
+
+  y = ei_pmadd(y, z, ei_p4f_coscof_p1);
+  y = ei_pmadd(y, z, ei_p4f_coscof_p2);
+  y = ei_pmul(y, z);
+  y = ei_pmul(y, z);
+  Packet4f tmp = ei_pmul(z, ei_p4f_half);
+  y = ei_psub(y, tmp);
+  y = ei_padd(y, ei_p4f_1);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  Packet4f y2 = ei_p4f_sincof_p0;
+  y2 = ei_pmadd(y2, z, ei_p4f_sincof_p1);
+  y2 = ei_pmadd(y2, z, ei_p4f_sincof_p2);
+  y2 = ei_pmul(y2, z);
+  y2 = ei_pmul(y2, x);
+  y2 = ei_padd(y2, x);
+
+  /* select the correct result from the two polynoms */
+  y2 = _mm_and_ps(poly_mask, y2);
+  y = _mm_andnot_ps(poly_mask, y);
+  y = _mm_or_ps(y,y2);
+  /* update the sign */
+  return _mm_xor_ps(y, sign_bit);
+}
+
+/* almost the same as ei_psin */
+static EIGEN_DONT_INLINE EIGEN_UNUSED Packet4f ei_pcos(Packet4f x)
+{
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+
+  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
+  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
+  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
+
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+
+  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+  Packet4i emm0, emm2;
+
+  x = ei_pabs(x);
+
+  /* scale by 4/Pi */
+  y = ei_pmul(x, ei_p4f_cephes_FOPI);
+
+  /* get the integer part of y */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, ei_p4i_1);
+  emm2 = _mm_and_si128(emm2, ei_p4i_not1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, ei_p4i_2);
+
+  /* get the swap sign flag */
+  emm0 = _mm_andnot_si128(emm2, ei_p4i_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask */
+  emm2 = _mm_and_si128(emm2, ei_p4i_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+  Packet4f sign_bit = _mm_castsi128_ps(emm0);
+  Packet4f poly_mask = _mm_castsi128_ps(emm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = ei_pmul(y, ei_p4f_minus_cephes_DP1);
+  xmm2 = ei_pmul(y, ei_p4f_minus_cephes_DP2);
+  xmm3 = ei_pmul(y, ei_p4f_minus_cephes_DP3);
+  x = ei_padd(x, xmm1);
+  x = ei_padd(x, xmm2);
+  x = ei_padd(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = ei_p4f_coscof_p0;
+  Packet4f z = ei_pmul(x,x);
+
+  y = ei_pmadd(y,z,ei_p4f_coscof_p1);
+  y = ei_pmadd(y,z,ei_p4f_coscof_p2);
+  y = ei_pmul(y, z);
+  y = ei_pmul(y, z);
+  Packet4f tmp = _mm_mul_ps(z, ei_p4f_half);
+  y = ei_psub(y, tmp);
+  y = ei_padd(y, ei_p4f_1);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+  Packet4f y2 = ei_p4f_sincof_p0;
+  y2 = ei_pmadd(y2, z, ei_p4f_sincof_p1);
+  y2 = ei_pmadd(y2, z, ei_p4f_sincof_p2);
+  y2 = ei_pmul(y2, z);
+  y2 = ei_pmadd(y2, x, x);
+
+  /* select the correct result from the two polynoms */
+  y2 = _mm_and_ps(poly_mask, y2);
+  y  = _mm_andnot_ps(poly_mask, y);
+  y  = _mm_or_ps(y,y2);
+
+  /* update the sign */
+  return _mm_xor_ps(y, sign_bit);
+}
+
+static EIGEN_UNUSED Packet4f ei_psqrt(Packet4f _x)
+{
+  Packet4f half = ei_pmul(_x, ei_pset1(.5f));
+  Packet4f x = _mm_rsqrt_ps(_x);
+  x = ei_pmul(x, ei_psub(ei_pset1(1.5f), ei_pmul(half, ei_pmul(x,x))));
+  return ei_pmul(_x,x);
+}
+
+#endif // EIGEN_MATH_FUNCTIONS_SSE_H
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -29,147 +29,245 @@
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
 #endif

-template<> struct ei_packet_traits<float>  { typedef __m128  type; enum {size=4}; };
-template<> struct ei_packet_traits<double> { typedef __m128d type; enum {size=2}; };
-template<> struct ei_packet_traits<int>    { typedef __m128i type; enum {size=4}; };
+typedef __m128  Packet4f;
+typedef __m128i Packet4i;
+typedef __m128d Packet2d;

-template<> struct ei_unpacket_traits<__m128>  { typedef float  type; enum {size=4}; };
-template<> struct ei_unpacket_traits<__m128d> { typedef double type; enum {size=2}; };
-template<> struct ei_unpacket_traits<__m128i> { typedef int    type; enum {size=4}; };
+#define ei_vec4f_swizzle1(v,p,q,r,s) \
+  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))

-template<> EIGEN_STRONG_INLINE __m128  ei_pset1<float>(const float&  from) { return _mm_set1_ps(from); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pset1<int>(const int&    from) { return _mm_set1_epi32(from); }
+#define ei_vec4i_swizzle1(v,p,q,r,s) \
+  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))

-template<> EIGEN_STRONG_INLINE __m128  ei_padd<__m128>(const __m128&  a, const __m128&  b) { return _mm_add_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_padd<__m128d>(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_padd<__m128i>(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); }
+#define ei_vec4f_swizzle2(a,b,p,q,r,s) \
+  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))

-template<> EIGEN_STRONG_INLINE __m128  ei_psub<__m128>(const __m128&  a, const __m128&  b) { return _mm_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_psub<__m128d>(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_psub<__m128i>(const __m128i& a, const __m128i& b) { return _mm_sub_epi32(a,b); }
+#define ei_vec4i_swizzle2(a,b,p,q,r,s) \
+  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+  
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)

-template<> EIGEN_STRONG_INLINE __m128  ei_pmul<__m128>(const __m128&  a, const __m128&  b) { return _mm_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pmul<__m128d>(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pmul<__m128i>(const __m128i& a, const __m128i& b)
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<int>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
+
+template<> struct ei_packet_traits<float>  : ei_default_packet_traits
 {
-  return _mm_or_si128(
-    _mm_and_si128(
-      _mm_mul_epu32(a,b),
-      _mm_setr_epi32(0xffffffff,0,0xffffffff,0)),
-    _mm_slli_si128(
-      _mm_and_si128(
-        _mm_mul_epu32(_mm_srli_si128(a,4),_mm_srli_si128(b,4)),
-        _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4));
+  typedef Packet4f type; enum {size=4};
+  enum {
+    HasSin  = 1,
+    HasCos  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1
+  };
+};
+template<> struct ei_packet_traits<double> : ei_default_packet_traits
+{ typedef Packet2d type; enum {size=2}; };
+template<> struct ei_packet_traits<int>    : ei_default_packet_traits
+{ typedef Packet4i type; enum {size=4}; };
+
+template<> struct ei_unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
+template<> struct ei_unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
+template<> struct ei_unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int&    from) { return _mm_set1_epi32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pnegate(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+  return _mm_xor_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d ei_pnegate(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
+  return _mm_xor_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_pnegate(const Packet4i& a)
+{
+  return ei_psub(_mm_setr_epi32(0,0,0,0), a);
 }

-template<> EIGEN_STRONG_INLINE __m128  ei_pdiv<__m128>(const __m128&  a, const __m128&  b) { return _mm_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pdiv<__m128d>(const __m128d& a, const __m128d& b) { return _mm_div_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pdiv<__m128i>(const __m128i& /*a*/, const __m128i& /*b*/)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+  // this version is slightly faster than 4 scalar products
+  return ei_vec4i_swizzle1(
+            ei_vec4i_swizzle2(
+              _mm_mul_epu32(a,b),
+              _mm_mul_epu32(ei_vec4i_swizzle1(a,1,0,3,2),
+                            ei_vec4i_swizzle1(b,1,0,3,2)),
+              0,2,0,2),
+            0,2,1,3);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { ei_assert(false && "packet integer division are not supported by SSE");
-  __m128i dummy = ei_pset1<int>(0);
-  return dummy;
+  return ei_pset1<int>(0);
 }

-// for some weird raisons, it has to be overloaded for packet integer
-template<> EIGEN_STRONG_INLINE __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); }
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return ei_padd(ei_pmul(a,b), c); }

-template<> EIGEN_STRONG_INLINE __m128  ei_pmin<__m128>(const __m128&  a, const __m128&  b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pmin<__m128d>(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
-// FIXME this vectorized min operator is likely to be slower than the standard one
-template<> EIGEN_STRONG_INLINE __m128i ei_pmin<__m128i>(const __m128i& a, const __m128i& b)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
-  __m128i mask = _mm_cmplt_epi32(a,b);
+  // after some bench, this version *is* faster than a scalar implementation
+  Packet4i mask = _mm_cmplt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
 }

-template<> EIGEN_STRONG_INLINE __m128  ei_pmax<__m128>(const __m128&  a, const __m128&  b) { return _mm_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pmax<__m128d>(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); }
-// FIXME this vectorized max operator is likely to be slower than the standard one
-template<> EIGEN_STRONG_INLINE __m128i ei_pmax<__m128i>(const __m128i& a, const __m128i& b)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
-  __m128i mask = _mm_cmpgt_epi32(a,b);
+  // after some bench, this version *is* faster than a scalar implementation
+  Packet4i mask = _mm_cmpgt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
 }

-template<> EIGEN_STRONG_INLINE __m128  ei_pload<float>(const float*   from) { return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pload<double>(const double*  from) { return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pload<int>(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }

-template<> EIGEN_STRONG_INLINE __m128  ei_ploadu<float>(const float*   from) { return _mm_loadu_ps(from); }
-// template<> EIGEN_STRONG_INLINE __m128  ei_ploadu(const float*   from) {
-//   if (size_t(from)&0xF)
-//     return _mm_loadu_ps(from);
-//   else 
-//     return _mm_loadu_ps(from);
-// }
-template<> EIGEN_STRONG_INLINE __m128d ei_ploadu<double>(const double*  from) { return _mm_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE __m128i ei_ploadu<int>(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }

-template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float*  to, const __m128&  from) { _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const __m128d& from) { _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int*    to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }

-template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float*  to, const __m128&  from) { _mm_storeu_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int*    to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float*    from) { return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double*  from) { return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float*   from) {
+  Packet4f r;
+  r = _mm_castpd_ps(_mm_load_sd((double*)(from)));
+  r = _mm_loadh_pi(r, (const __m64*)(from+2));
+  return r;
+}
+template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double*  from) { return _mm_castps_pd(ei_ploadu((const float*)(from))); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { return _mm_castpd_si128(ei_ploadu((const double*)(from))); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float*   to, const Packet4f& from) { _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int*       to, const Packet4i& from) { _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const Packet2d& from) {
+  _mm_storel_pd((to), from);
+  _mm_storeh_pd((to+1), from);
+}
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float*  to, const Packet4f& from) { ei_pstoreu((double*)to, _mm_castps_pd(from)); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int*      to, const Packet4i& from) { ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }

 #ifdef _MSC_VER
 // this fix internal compilation error
-template<> EIGEN_STRONG_INLINE float  ei_pfirst<__m128>(const __m128&  a) { float x = _mm_cvtss_f32(a); return x; }
-template<> EIGEN_STRONG_INLINE double ei_pfirst<__m128d>(const __m128d& a) { double x = _mm_cvtsd_f64(a); return x; }
-template<> EIGEN_STRONG_INLINE int    ei_pfirst<__m128i>(const __m128i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+template<> EIGEN_STRONG_INLINE float  ei_pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
+template<> EIGEN_STRONG_INLINE double ei_pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
+template<> EIGEN_STRONG_INLINE int    ei_pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
 #else
-template<> EIGEN_STRONG_INLINE float  ei_pfirst<__m128>(const __m128&  a) { return _mm_cvtss_f32(a); }
-template<> EIGEN_STRONG_INLINE double ei_pfirst<__m128d>(const __m128d& a) { return _mm_cvtsd_f64(a); }
-template<> EIGEN_STRONG_INLINE int    ei_pfirst<__m128i>(const __m128i& a) { return _mm_cvtsi128_si32(a); }
+template<> EIGEN_STRONG_INLINE float  ei_pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
+template<> EIGEN_STRONG_INLINE double ei_pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
+template<> EIGEN_STRONG_INLINE int    ei_pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
 #endif

+template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a)
+{ return _mm_shuffle_ps(a,a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_preverse(const Packet2d& a)
+{ return _mm_shuffle_pd(a,a,0x1); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a)
+{ return _mm_shuffle_epi32(a,0x1B); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d ei_pabs(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm_and_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a)
+{
+  #ifdef __SSSE3__
+  return _mm_abs_epi32(a);
+  #else
+  Packet4i aux = _mm_srai_epi32(a,31);
+  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
+  #endif
+}
+
+
 #ifdef __SSE3__
 // TODO implement SSE2 versions as well as integer versions
-template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
+template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
 {
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
 }
-template<> EIGEN_STRONG_INLINE __m128d ei_preduxp<__m128d>(const __m128d* vecs)
+template<> EIGEN_STRONG_INLINE Packet2d ei_preduxp<Packet2d>(const Packet2d* vecs)
 {
  return _mm_hadd_pd(vecs[0], vecs[1]);
 }
 // SSSE3 version:
-// EIGEN_STRONG_INLINE __m128i ei_preduxp(const __m128i* vecs)
+// EIGEN_STRONG_INLINE Packet4i ei_preduxp(const Packet4i* vecs)
 // {
 //   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
 // }

-template<> EIGEN_STRONG_INLINE float ei_predux<__m128>(const __m128& a)
+template<> EIGEN_STRONG_INLINE float ei_predux<Packet4f>(const Packet4f& a)
 {
-  __m128 tmp0 = _mm_hadd_ps(a,a);
+  Packet4f tmp0 = _mm_hadd_ps(a,a);
  return ei_pfirst(_mm_hadd_ps(tmp0, tmp0));
 }

-template<> EIGEN_STRONG_INLINE double ei_predux<__m128d>(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); }
+template<> EIGEN_STRONG_INLINE double ei_predux<Packet2d>(const Packet2d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); }

 // SSSE3 version:
-// EIGEN_STRONG_INLINE float ei_predux(const __m128i& a)
+// EIGEN_STRONG_INLINE float ei_predux(const Packet4i& a)
 // {
-//   __m128i tmp0 = _mm_hadd_epi32(a,a);
+//   Packet4i tmp0 = _mm_hadd_epi32(a,a);
 //   return ei_pfirst(_mm_hadd_epi32(tmp0, tmp0));
 // }
 #else
 // SSE2 versions
-template<> EIGEN_STRONG_INLINE float ei_predux<__m128>(const __m128& a)
+template<> EIGEN_STRONG_INLINE float ei_predux<Packet4f>(const Packet4f& a)
 {
-  __m128 tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
  return ei_pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
-template<> EIGEN_STRONG_INLINE double ei_predux<__m128d>(const __m128d& a)
+template<> EIGEN_STRONG_INLINE double ei_predux<Packet2d>(const Packet2d& a)
 {
  return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
 }

-template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
+template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
 {
-  __m128 tmp0, tmp1, tmp2;
+  Packet4f tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
@@ -181,21 +279,21 @@ template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
  return _mm_add_ps(tmp0, tmp2);
 }

-template<> EIGEN_STRONG_INLINE __m128d ei_preduxp<__m128d>(const __m128d* vecs)
+template<> EIGEN_STRONG_INLINE Packet2d ei_preduxp<Packet2d>(const Packet2d* vecs)
 {
  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
 }
 #endif  // SSE3

-template<> EIGEN_STRONG_INLINE int ei_predux<__m128i>(const __m128i& a)
+template<> EIGEN_STRONG_INLINE int ei_predux<Packet4i>(const Packet4i& a)
 {
-  __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
  return ei_pfirst(tmp) + ei_pfirst(_mm_shuffle_epi32(tmp, 1));
 }

-template<> EIGEN_STRONG_INLINE __m128i ei_preduxp<__m128i>(const __m128i* vecs)
+template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp<Packet4i>(const Packet4i* vecs)
 {
-  __m128i tmp0, tmp1, tmp2;
+  Packet4i tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
@@ -207,16 +305,80 @@ template<> EIGEN_STRONG_INLINE __m128i ei_preduxp<__m128i>(const __m128i* vecs)
  return _mm_add_epi32(tmp0, tmp2);
 }

+// Other reduction functions:
+
+// mul
+template<> EIGEN_STRONG_INLINE float ei_predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
+  return ei_pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double ei_predux_mul<Packet2d>(const Packet2d& a)
+{
+  return ei_pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int ei_predux_mul<Packet4i>(const Packet4i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., reusing ei_pmul is very slow !)
+  // TODO try to call _mm_mul_epu32 directly
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float ei_predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
+  return ei_pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double ei_predux_min<Packet2d>(const Packet2d& a)
+{
+  return ei_pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int ei_predux_min<Packet4i>(const Packet4i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., it does not like using std::min after the ei_pstore !!)
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float ei_predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
+  return ei_pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double ei_predux_max<Packet2d>(const Packet2d& a)
+{
+  return ei_pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., it does not like using std::min after the ei_pstore !!)
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  return aux0>aux2 ? aux0 : aux2;
+}
+
 #if (defined __GNUC__)
-// template <> EIGEN_STRONG_INLINE __m128 ei_pmadd(const __m128&  a, const __m128&  b, const __m128&  c)
+// template <> EIGEN_STRONG_INLINE Packet4f ei_pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
 // {
-//   __m128 res = b;
+//   Packet4f res = b;
 //   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
 //   return res;
 // }
-// EIGEN_STRONG_INLINE __m128i _mm_alignr_epi8(const __m128i&  a, const __m128i&  b, const int i)
+// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
 // {
-//   __m128i res = a;
+//   Packet4i res = a;
 //   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
 //   return res;
 // }
@@ -225,9 +387,9 @@ template<> EIGEN_STRONG_INLINE __m128i ei_preduxp<__m128i>(const __m128i* vecs)
 #ifdef __SSSE3__
 // SSSE3 versions
 template<int Offset>
-struct ei_palign_impl<Offset,__m128>
+struct ei_palign_impl<Offset,Packet4f>
 {
-  EIGEN_STRONG_INLINE static void run(__m128& first, const __m128& second)
+  EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset!=0)
      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
@@ -235,9 +397,9 @@ struct ei_palign_impl<Offset,__m128>
 };

 template<int Offset>
-struct ei_palign_impl<Offset,__m128i>
+struct ei_palign_impl<Offset,Packet4i>
 {
-  EIGEN_STRONG_INLINE static void run(__m128i& first, const __m128i& second)
+  EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset!=0)
      first = _mm_alignr_epi8(second,first, Offset*4);
@@ -245,9 +407,9 @@ struct ei_palign_impl<Offset,__m128i>
 };

 template<int Offset>
-struct ei_palign_impl<Offset,__m128d>
+struct ei_palign_impl<Offset,Packet2d>
 {
-  EIGEN_STRONG_INLINE static void run(__m128d& first, const __m128d& second)
+  EIGEN_STRONG_INLINE static void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
@@ -256,9 +418,9 @@ struct ei_palign_impl<Offset,__m128d>
 #else
 // SSE2 versions
 template<int Offset>
-struct ei_palign_impl<Offset,__m128>
+struct ei_palign_impl<Offset,Packet4f>
 {
-  EIGEN_STRONG_INLINE static void run(__m128& first, const __m128& second)
+  EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset==1)
    {
@@ -279,9 +441,9 @@ struct ei_palign_impl<Offset,__m128>
 };

 template<int Offset>
-struct ei_palign_impl<Offset,__m128i>
+struct ei_palign_impl<Offset,Packet4i>
 {
-  EIGEN_STRONG_INLINE static void run(__m128i& first, const __m128i& second)
+  EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset==1)
    {
@@ -302,9 +464,9 @@ struct ei_palign_impl<Offset,__m128i>
 };

 template<int Offset>
-struct ei_palign_impl<Offset,__m128d>
+struct ei_palign_impl<Offset,Packet2d>
 {
-  EIGEN_STRONG_INLINE static void run(__m128d& first, const __m128d& second)
+  EIGEN_STRONG_INLINE static void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
    {
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -0,0 +1,490 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_H
+
+template <int L2MemorySize,typename Scalar>
+struct ei_L2_block_traits {
+  enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
+};
+
+#ifndef EIGEN_EXTERN_INSTANTIATIONS
+
+template<typename Scalar>
+static void ei_cache_friendly_product(
+  int _rows, int _cols, int depth,
+  bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
+  bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
+  bool resRowMajor, Scalar* res, int resStride)
+{
+  const Scalar* EIGEN_RESTRICT lhs;
+  const Scalar* EIGEN_RESTRICT rhs;
+  int lhsStride, rhsStride, rows, cols;
+  bool lhsRowMajor;
+
+  if (resRowMajor)
+  {
+    lhs = _rhs;
+    rhs = _lhs;
+    lhsStride = _rhsStride;
+    rhsStride = _lhsStride;
+    cols = _rows;
+    rows = _cols;
+    lhsRowMajor = !_rhsRowMajor;
+    ei_assert(_lhsRowMajor);
+  }
+  else
+  {
+    lhs = _lhs;
+    rhs = _rhs;
+    lhsStride = _lhsStride;
+    rhsStride = _rhsStride;
+    rows = _rows;
+    cols = _cols;
+    lhsRowMajor = _lhsRowMajor;
+    ei_assert(!_rhsRowMajor);
+  }
+
+  typedef typename ei_packet_traits<Scalar>::type PacketType;
+
+
+
+#ifndef EIGEN_USE_ALT_PRODUCT
+
+  enum {
+    PacketSize = sizeof(PacketType)/sizeof(Scalar),
+    #if (defined __i386__)
+    HalfRegisterCount = 4,
+    #else
+    HalfRegisterCount = 8,
+    #endif
+
+    // register block size along the N direction
+    nr = HalfRegisterCount/2,
+
+    // register block size along the M direction
+    mr = 2 * PacketSize,
+
+    // max cache block size along the K direction
+    Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width,
+
+    // max cache block size along the M direction
+    Max_mc = 2*Max_kc
+  };
+
+  int kc = std::min<int>(Max_kc,depth);  // cache block size along the K direction
+  int mc = std::min<int>(Max_mc,rows);   // cache block size along the M direction
+
+  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize);
+
+  // number of columns which can be processed by packet of nr columns
+  int packet_cols = (cols/nr)*nr;
+
+  // GEMM_VAR1
+  for(int k2=0; k2<depth; k2+=kc)
+  {
+    const int actual_kc = std::min(k2+kc,depth)-k2;
+
+    // we have selected one row panel of rhs and one column panel of lhs
+    // pack rhs's panel into a sequential chunk of memory
+    // and expand each coeff to a constant packet for further reuse
+    {
+      int count = 0;
+      for(int j2=0; j2<packet_cols; j2+=nr)
+      {
+        const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2];
+        const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2];
+        const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2];
+        const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2];
+        for(int k=0; k<actual_kc; k++)
+        {
+          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
+          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
+          if (nr==4)
+          {
+            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
+            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
+          }
+          count += nr*PacketSize;
+        }
+      }
+    }
+
+    // => GEPP_VAR1
+    for(int i2=0; i2<rows; i2+=mc)
+    {
+      const int actual_mc = std::min(i2+mc,rows)-i2;
+
+      // We have selected a mc x kc block of lhs
+      // Let's pack it in a clever order for further purely sequential access
+      int count = 0;
+      const int peeled_mc = (actual_mc/mr)*mr;
+      if (lhsRowMajor)
+      {
+        for(int i=0; i<peeled_mc; i+=mr)
+          for(int k=0; k<actual_kc; k++)
+            for(int w=0; w<mr; w++)
+              blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride];
+        for(int i=peeled_mc; i<actual_mc; i++)
+        {
+          const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride];
+          for(int k=0; k<actual_kc; k++)
+            blockA[count++] = llhs[k];
+        }
+      }
+      else
+      {
+        for(int i=0; i<peeled_mc; i+=mr)
+          for(int k=0; k<actual_kc; k++)
+            for(int w=0; w<mr; w++)
+              blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w];
+        for(int i=peeled_mc; i<actual_mc; i++)
+          for(int k=0; k<actual_kc; k++)
+            blockA[count++] = lhs[(k2+k)*lhsStride + i2+i];
+      }
+
+      // GEBP
+      // loops on each cache friendly block of the result/rhs
+      for(int j2=0; j2<packet_cols; j2+=nr)
+      {
+        // loops on each register blocking of lhs/res
+        const int peeled_mc = (actual_mc/mr)*mr;
+        for(int i=0; i<peeled_mc; i+=mr)
+        {
+          const Scalar* blA = &blockA[i*actual_kc];
+          #ifdef EIGEN_VECTORIZE_SSE
+          _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
+          #endif
+
+          // TODO move the res loads to the stores
+
+          // gets res block as register
+          PacketType C0, C1, C2, C3, C4, C5, C6, C7;
+                    C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]);
+                    C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]);
+          if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]);
+          if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]);
+                    C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]);
+                    C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]);
+          if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]);
+          if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]);
+
+          // performs "inner" product
+          // TODO let's check wether the flowing peeled loop could not be
+          //      optimized via optimal prefetching from one loop to the other
+          const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
+          const int peeled_kc = (actual_kc/4)*4;
+          for(int k=0; k<peeled_kc; k+=4)
+          {
+            PacketType B0, B1, B2, B3, A0, A1;
+
+                      A0 = ei_pload(&blA[0*PacketSize]);
+                      A1 = ei_pload(&blA[1*PacketSize]);
+                      B0 = ei_pload(&blB[0*PacketSize]);
+                      B1 = ei_pload(&blB[1*PacketSize]);
+                      C0 = ei_pmadd(B0, A0, C0);
+            if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+                      C4 = ei_pmadd(B0, A1, C4);
+            if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+                      B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
+                      C1 = ei_pmadd(B1, A0, C1);
+                      C5 = ei_pmadd(B1, A1, C5);
+                      B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
+            if(nr==4) C2 = ei_pmadd(B2, A0, C2);
+            if(nr==4) C6 = ei_pmadd(B2, A1, C6);
+            if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
+            if(nr==4) C3 = ei_pmadd(B3, A0, C3);
+                      A0 = ei_pload(&blA[2*PacketSize]);
+            if(nr==4) C7 = ei_pmadd(B3, A1, C7);
+                      A1 = ei_pload(&blA[3*PacketSize]);
+            if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
+                      C0 = ei_pmadd(B0, A0, C0);
+                      C4 = ei_pmadd(B0, A1, C4);
+                      B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
+                      C1 = ei_pmadd(B1, A0, C1);
+                      C5 = ei_pmadd(B1, A1, C5);
+                      B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
+            if(nr==4) C2 = ei_pmadd(B2, A0, C2);
+            if(nr==4) C6 = ei_pmadd(B2, A1, C6);
+            if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
+            if(nr==4) C3 = ei_pmadd(B3, A0, C3);
+                      A0 = ei_pload(&blA[4*PacketSize]);
+            if(nr==4) C7 = ei_pmadd(B3, A1, C7);
+                      A1 = ei_pload(&blA[5*PacketSize]);
+            if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
+
+                      C0 = ei_pmadd(B0, A0, C0);
+                      C4 = ei_pmadd(B0, A1, C4);
+                      B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
+                      C1 = ei_pmadd(B1, A0, C1);
+                      C5 = ei_pmadd(B1, A1, C5);
+                      B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
+            if(nr==4) C2 = ei_pmadd(B2, A0, C2);
+            if(nr==4) C6 = ei_pmadd(B2, A1, C6);
+            if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
+            if(nr==4) C3 = ei_pmadd(B3, A0, C3);
+                      A0 = ei_pload(&blA[6*PacketSize]);
+            if(nr==4) C7 = ei_pmadd(B3, A1, C7);
+                      A1 = ei_pload(&blA[7*PacketSize]);
+            if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
+                      C0 = ei_pmadd(B0, A0, C0);
+                      C4 = ei_pmadd(B0, A1, C4);
+                      C1 = ei_pmadd(B1, A0, C1);
+                      C5 = ei_pmadd(B1, A1, C5);
+            if(nr==4) C2 = ei_pmadd(B2, A0, C2);
+            if(nr==4) C6 = ei_pmadd(B2, A1, C6);
+            if(nr==4) C3 = ei_pmadd(B3, A0, C3);
+            if(nr==4) C7 = ei_pmadd(B3, A1, C7);
+
+            blB += 4*nr*PacketSize;
+            blA += 4*mr;
+          }
+          // process remaining peeled loop
+          for(int k=peeled_kc; k<actual_kc; k++)
+          {
+            PacketType B0, B1, B2, B3, A0, A1;
+
+                      A0 = ei_pload(&blA[0*PacketSize]);
+                      A1 = ei_pload(&blA[1*PacketSize]);
+                      B0 = ei_pload(&blB[0*PacketSize]);
+                      B1 = ei_pload(&blB[1*PacketSize]);
+                      C0 = ei_pmadd(B0, A0, C0);
+            if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+                      C4 = ei_pmadd(B0, A1, C4);
+            if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+                      C1 = ei_pmadd(B1, A0, C1);
+                      C5 = ei_pmadd(B1, A1, C5);
+            if(nr==4) C2 = ei_pmadd(B2, A0, C2);
+            if(nr==4) C6 = ei_pmadd(B2, A1, C6);
+            if(nr==4) C3 = ei_pmadd(B3, A0, C3);
+            if(nr==4) C7 = ei_pmadd(B3, A1, C7);
+
+            blB += nr*PacketSize;
+            blA += mr;
+          }
+
+                    ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0);
+                    ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1);
+          if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2);
+          if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3);
+                    ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4);
+                    ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5);
+          if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6);
+          if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7);
+        }
+        for(int i=peeled_mc; i<actual_mc; i++)
+        {
+          const Scalar* blA = &blockA[i*actual_kc];
+          #ifdef EIGEN_VECTORIZE_SSE
+          _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
+          #endif
+
+          // gets a 1 x nr res block as registers
+          Scalar C0(0), C1(0), C2(0), C3(0);
+          const Scalar* blB = &blockB[j2*actual_kc*PacketSize];
+          for(int k=0; k<actual_kc; k++)
+          {
+            Scalar B0, B1, B2, B3, A0;
+
+                      A0 =  blA[k];
+                      B0 =  blB[0*PacketSize];
+                      B1 =  blB[1*PacketSize];
+                      C0 += B0 * A0;
+            if(nr==4) B2 =  blB[2*PacketSize];
+            if(nr==4) B3 =  blB[3*PacketSize];
+                      C1 += B1 * A0;
+            if(nr==4) C2 += B2 * A0;
+            if(nr==4) C3 += B3 * A0;
+
+            blB += nr*PacketSize;
+          }
+          res[(j2+0)*resStride + i2 + i] += C0;
+          res[(j2+1)*resStride + i2 + i] += C1;
+          if(nr==4) res[(j2+2)*resStride + i2 + i] += C2;
+          if(nr==4) res[(j2+3)*resStride + i2 + i] += C3;
+        }
+      }
+      // remaining rhs/res columns (<nr)
+      for(int j2=packet_cols; j2<cols; j2++)
+      {
+        for(int i=0; i<actual_mc; i++)
+        {
+          Scalar c0 = res[(j2)*resStride + i2+i];
+          if (lhsRowMajor)
+            for(int k=0; k<actual_kc; k++)
+              c0 += lhs[(k2+k)+(i2+i)*lhsStride] * rhs[j2*rhsStride + k2 + k];
+          else
+            for(int k=0; k<actual_kc; k++)
+              c0 += lhs[(k2+k)*lhsStride + i2+i] * rhs[j2*rhsStride + k2 + k];
+          res[(j2)*resStride + i2+i] = c0;
+        }
+      }
+    }
+  }
+
+  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+  ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize);
+
+#else // alternate product from cylmor
+
+  enum {
+    PacketSize = sizeof(PacketType)/sizeof(Scalar),
+    #if (defined __i386__)
+    // i386 architecture provides only 8 xmm registers,
+    // so let's reduce the max number of rows processed at once.
+    MaxBlockRows = 4,
+    MaxBlockRows_ClampingMask = 0xFFFFFC,
+    #else
+    MaxBlockRows = 8,
+    MaxBlockRows_ClampingMask = 0xFFFFF8,
+    #endif
+    // maximal size of the blocks fitted in L2 cache
+    MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
+  };
+
+  const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
+
+  const int remainingSize = depth % PacketSize;
+  const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
+
+  const int l2BlockRows = MaxL2BlockSize > rows ? rows : 512;
+  const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128;
+  const int l2BlockSize = MaxL2BlockSize > size ? size : 256;
+  const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
+  const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
+
+  Scalar* EIGEN_RESTRICT block = new Scalar[l2BlockRows*size];
+//   for(int i=0; i<l2BlockRows*l2BlockSize; ++i)
+//     block[i] = 0;
+  // loops on each L2 cache friendly blocks of lhs
+  for(int l2k=0; l2k<depth; l2k+=l2BlockSize)
+  {
+    for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
+    {
+      // We have selected a block of lhs
+      // Packs this block into 'block'
+      int count = 0;
+      for(int k=0; k<l2BlockSize; k+=MaxBlockRows)
+      {
+        for(int i=0; i<l2BlockRows; i+=2*PacketSize)
+          for (int w=0; w<MaxBlockRows; ++w)
+            for (int y=0; y<2*PacketSize; ++y)
+              block[count++] = lhs[(k+l2k+w)*lhsStride + l2i+i+ y];
+      }
+
+      // loops on each L2 cache firendly block of the result/rhs
+      for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
+      {
+        for(int k=0; k<l2BlockSize; k+=MaxBlockRows)
+        {
+          for(int j=0; j<l2BlockCols; ++j)
+          {
+            PacketType A0, A1, A2, A3, A4, A5, A6, A7;
+
+            // Load the packets from rhs and reorder them
+
+            // Here we need some vector reordering
+            // Right now its hardcoded to packets of 4 elements
+            const Scalar* lrhs = &rhs[(j+l2j)*rhsStride+(k+l2k)];
+            A0 = ei_pset1(lrhs[0]);
+            A1 = ei_pset1(lrhs[1]);
+            A2 = ei_pset1(lrhs[2]);
+            A3 = ei_pset1(lrhs[3]);
+            if (MaxBlockRows==8)
+            {
+              A4 = ei_pset1(lrhs[4]);
+              A5 = ei_pset1(lrhs[5]);
+              A6 = ei_pset1(lrhs[6]);
+              A7 = ei_pset1(lrhs[7]);
+            }
+
+            Scalar * lb = &block[l2BlockRows * k];
+            for(int i=0; i<l2BlockRows; i+=2*PacketSize)
+            {
+              PacketType R0, R1, L0, L1, T0, T1;
+
+              // We perform "cross products" of vectors to avoid
+              // reductions (horizontal ops) afterwards
+              T0 = ei_pload(&res[(j+l2j)*resStride+l2i+i]);
+              T1 = ei_pload(&res[(j+l2j)*resStride+l2i+i+PacketSize]);
+
+              R0 = ei_pload(&lb[0*PacketSize]);
+              L0 = ei_pload(&lb[1*PacketSize]);
+              R1 = ei_pload(&lb[2*PacketSize]);
+              L1 = ei_pload(&lb[3*PacketSize]);
+              T0 = ei_pmadd(R0, A0, T0);
+              T1 = ei_pmadd(L0, A0, T1);
+              R0 = ei_pload(&lb[4*PacketSize]);
+              L0 = ei_pload(&lb[5*PacketSize]);
+              T0 = ei_pmadd(R1, A1, T0);
+              T1 = ei_pmadd(L1, A1, T1);
+              R1 = ei_pload(&lb[6*PacketSize]);
+              L1 = ei_pload(&lb[7*PacketSize]);
+              T0 = ei_pmadd(R0, A2, T0);
+              T1 = ei_pmadd(L0, A2, T1);
+              if(MaxBlockRows==8)
+              {
+                R0 = ei_pload(&lb[8*PacketSize]);
+                L0 = ei_pload(&lb[9*PacketSize]);
+              }
+              T0 = ei_pmadd(R1, A3, T0);
+              T1 = ei_pmadd(L1, A3, T1);
+              if(MaxBlockRows==8)
+              {
+                R1 = ei_pload(&lb[10*PacketSize]);
+                L1 = ei_pload(&lb[11*PacketSize]);
+                T0 = ei_pmadd(R0, A4, T0);
+                T1 = ei_pmadd(L0, A4, T1);
+                R0 = ei_pload(&lb[12*PacketSize]);
+                L0 = ei_pload(&lb[13*PacketSize]);
+                T0 = ei_pmadd(R1, A5, T0);
+                T1 = ei_pmadd(L1, A5, T1);
+                R1 = ei_pload(&lb[14*PacketSize]);
+                L1 = ei_pload(&lb[15*PacketSize]);
+                T0 = ei_pmadd(R0, A6, T0);
+                T1 = ei_pmadd(L0, A6, T1);
+                T0 = ei_pmadd(R1, A7, T0);
+                T1 = ei_pmadd(L1, A7, T1);
+              }
+              lb += MaxBlockRows*2*PacketSize;
+
+              ei_pstore(&res[(j+l2j)*resStride+l2i+i], T0);
+              ei_pstore(&res[(j+l2j)*resStride+l2i+i+PacketSize], T1);
+            }
+          }
+        }
+      }
+    }
+  }
+  delete[] block;
+#endif
+
+
+}
+
+#endif // EIGEN_EXTERN_INSTANTIATIONS
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -22,327 +22,8 @@
 // License and a copy of the GNU General Public License along with
 // Eigen. If not, see <http://www.gnu.org/licenses/>.

-#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
-#define EIGEN_CACHE_FRIENDLY_PRODUCT_H
-
-template <int L2MemorySize,typename Scalar>
-struct ei_L2_block_traits {
-  enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
-};
-
-#ifndef EIGEN_EXTERN_INSTANTIATIONS
-
-template<typename Scalar>
-static void ei_cache_friendly_product(
-  int _rows, int _cols, int depth,
-  bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
-  bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
-  bool resRowMajor, Scalar* res, int resStride)
-{
-  const Scalar* EIGEN_RESTRICT lhs;
-  const Scalar* EIGEN_RESTRICT rhs;
-  int lhsStride, rhsStride, rows, cols;
-  bool lhsRowMajor;
-
-  if (resRowMajor)
-  {
-    lhs = _rhs;
-    rhs = _lhs;
-    lhsStride = _rhsStride;
-    rhsStride = _lhsStride;
-    cols = _rows;
-    rows = _cols;
-    lhsRowMajor = !_rhsRowMajor;
-    ei_assert(_lhsRowMajor);
-  }
-  else
-  {
-    lhs = _lhs;
-    rhs = _rhs;
-    lhsStride = _lhsStride;
-    rhsStride = _rhsStride;
-    rows = _rows;
-    cols = _cols;
-    lhsRowMajor = _lhsRowMajor;
-    ei_assert(!_rhsRowMajor);
-  }
-
-  typedef typename ei_packet_traits<Scalar>::type PacketType;
-
-  enum {
-    PacketSize = sizeof(PacketType)/sizeof(Scalar),
-    #if (defined __i386__)
-    // i386 architecture provides only 8 xmm registers,
-    // so let's reduce the max number of rows processed at once.
-    MaxBlockRows = 4,
-    MaxBlockRows_ClampingMask = 0xFFFFFC,
-    #else
-    MaxBlockRows = 8,
-    MaxBlockRows_ClampingMask = 0xFFFFF8,
-    #endif
-    // maximal size of the blocks fitted in L2 cache
-    MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
-  };
-
-  const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
-
-  const int remainingSize = depth % PacketSize;
-  const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
-  const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
-  const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
-  const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
-  const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
-  const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
-  Scalar* EIGEN_RESTRICT block = 0;
-  const int allocBlockSize = l2BlockRows*size;
-  block = ei_aligned_stack_new(Scalar, allocBlockSize);
-  Scalar* EIGEN_RESTRICT rhsCopy
-    = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned);
-
-  // loops on each L2 cache friendly blocks of the result
-  for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
-  {
-    const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
-    const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask;    // end of the rows aligned to bw
-    const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW;         // number of remaining rows
-    //const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
-
-    // build a cache friendly blocky matrix
-    int count = 0;
-
-    // copy l2blocksize rows of m_lhs to blocks of ps x bw
-    for(int l2k=0; l2k<size; l2k+=l2BlockSize)
-    {
-      const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
-
-      for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
-      {
-        // TODO merge the "if l2blockRemainingRows" using something like:
-        // const int blockRows = std::min(i+MaxBlockRows, rows) - i;
-
-        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-        {
-          // TODO write these loops using meta unrolling
-          // negligible for large matrices but useful for small ones
-          if (lhsRowMajor)
-          {
-            for (int w=0; w<MaxBlockRows; ++w)
-              for (int s=0; s<PacketSize; ++s)
-                block[count++] = lhs[(i+w)*lhsStride + (k+s)];
-          }
-          else
-          {
-            for (int w=0; w<MaxBlockRows; ++w)
-              for (int s=0; s<PacketSize; ++s)
-                block[count++] = lhs[(i+w) + (k+s)*lhsStride];
-          }
-        }
-      }
-      if (l2blockRemainingRows>0)
-      {
-        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-        {
-          if (lhsRowMajor)
-          {
-            for (int w=0; w<l2blockRemainingRows; ++w)
-              for (int s=0; s<PacketSize; ++s)
-                block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
-          }
-          else
-          {
-            for (int w=0; w<l2blockRemainingRows; ++w)
-              for (int s=0; s<PacketSize; ++s)
-                block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
-          }
-        }
-      }
-    }
-
-    for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
-    {
-      int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
-
-      for(int l2k=0; l2k<size; l2k+=l2BlockSize)
-      {
-        // acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res
-        int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
-
-        // if not aligned, copy the rhs block
-        if (needRhsCopy)
-          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-          {
-            ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
-            memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
-          }
-
-        // for each bw x 1 result's block
-        for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
-        {
-          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
-          const Scalar* EIGEN_RESTRICT localB = &block[offsetblock];
-          
-          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-          {
-            const Scalar* EIGEN_RESTRICT rhsColumn;
-            if (needRhsCopy)
-              rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
-            else
-              rhsColumn = &(rhs[l1j*rhsStride]);
-
-            PacketType dst[MaxBlockRows];
-            dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
-            if (MaxBlockRows==8)
-              dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
-
-            PacketType tmp;
-
-            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-            {
-              tmp = ei_ploadu(&rhsColumn[k]);
-              PacketType A0, A1, A2, A3, A4, A5;
-              A0 = ei_pload(localB + k*MaxBlockRows);
-              A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
-              A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
-              A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
-              if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
-              if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
-              dst[0] = ei_pmadd(tmp, A0, dst[0]);
-              if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
-              dst[1] = ei_pmadd(tmp, A1, dst[1]);
-              if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
-              dst[2] = ei_pmadd(tmp, A2, dst[2]);
-              dst[3] = ei_pmadd(tmp, A3, dst[3]);
-              if (MaxBlockRows==8)
-              {
-                dst[4] = ei_pmadd(tmp, A4, dst[4]);
-                dst[5] = ei_pmadd(tmp, A5, dst[5]);
-                dst[6] = ei_pmadd(tmp, A0, dst[6]);
-                dst[7] = ei_pmadd(tmp, A1, dst[7]);
-              }
-            }
-
-            Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]);
-
-            if (PacketSize>1 && resIsAligned)
-            {
-              // the result is aligned: let's do packet reduction
-              ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
-              if (PacketSize==2)
-                ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
-              if (MaxBlockRows==8)
-              {
-                ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
-                if (PacketSize==2)
-                  ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
-              }
-            }
-            else
-            {
-              // not aligned => per coeff packet reduction
-              localRes[0] += ei_predux(dst[0]);
-              localRes[1] += ei_predux(dst[1]);
-              localRes[2] += ei_predux(dst[2]);
-              localRes[3] += ei_predux(dst[3]);
-              if (MaxBlockRows==8)
-              {
-                localRes[4] += ei_predux(dst[4]);
-                localRes[5] += ei_predux(dst[5]);
-                localRes[6] += ei_predux(dst[6]);
-                localRes[7] += ei_predux(dst[7]);
-              }
-            }
-          }
-        }
-        if (l2blockRemainingRows>0)
-        {
-          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
-          const Scalar* localB = &block[offsetblock];
-
-          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-          {
-            const Scalar* EIGEN_RESTRICT rhsColumn;
-            if (needRhsCopy)
-              rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
-            else
-              rhsColumn = &(rhs[l1j*rhsStride]);
-
-            PacketType dst[MaxBlockRows];
-            dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
-            if (MaxBlockRows==8)
-              dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
-
-            // let's declare a few other temporary registers
-            PacketType tmp;
-
-            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-            {
-              tmp = ei_pload(&rhsColumn[k]);
-
-                                           dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows             ])), dst[0]);
-              if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+  PacketSize])), dst[1]);
-              if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
-              if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
-              if (MaxBlockRows==8)
-              {
-                if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
-                if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
-                if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
-                if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
-              }
-            }
-
-            Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]);
-
-            // process the remaining rows once at a time
-                                         localRes[0] += ei_predux(dst[0]);
-            if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
-            if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
-            if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
-            if (MaxBlockRows==8)
-            {
-              if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
-              if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
-              if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
-              if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
-            }
-
-          }
-        }
-      }
-    }
-  }
-  if (PacketSize>1 && remainingSize)
-  {
-    if (lhsRowMajor)
-    {
-      for (int j=0; j<cols; ++j)
-        for (int i=0; i<rows; ++i)
-        {
-          Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
-          // FIXME this loop get vectorized by the compiler !
-          for (int k=1; k<remainingSize; ++k)
-            tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
-          res[i+j*resStride] += tmp;
-        }
-    }
-    else
-    {
-      for (int j=0; j<cols; ++j)
-        for (int i=0; i<rows; ++i)
-        {
-          Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
-          for (int k=1; k<remainingSize; ++k)
-            tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
-          res[i+j*resStride] += tmp;
-        }
-    }
-  }
-
-  ei_aligned_stack_delete(Scalar, block, allocBlockSize);
-  ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned);
-}
-
-#endif // EIGEN_EXTERN_INSTANTIATIONS
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_H

 /* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at onces that allows to both reduce
@@ -495,7 +176,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_colmajor_times_vector(

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (int j=alignedSize; j<size; ++j)
-      res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
+	  res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
@@ -750,4 +431,4 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
  #undef _EIGEN_ACCUMULATE_PACKETS
 }

-#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -0,0 +1,146 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+
+template<bool Conjugate> struct ei_conj_if {
+  template<typename Scalar> Scalar operator() (const Scalar& x) const { return ei_conj(x); }
+};
+
+template<> struct ei_conj_if<false> {
+  template<typename Scalar> Scalar& operator() (Scalar& x) const { return x; }
+};
+
+/* Optimized col-major selfadjoint matrix * vector product:
+ * This algorithm processes 2 columns at onces that allows to both reduce
+ * the number of load/stores of the result by a factor 2 and to reduce
+ * the instruction dependency.
+ */
+template<typename Scalar, int StorageOrder, int UpLo>
+static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
+  int size,
+  const Scalar* lhs, int lhsStride,
+  const Scalar* rhs, //int rhsIncr,
+  Scalar* res)
+{
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+  enum {
+    IsRowMajor = StorageOrder==RowMajorBit ? 1 : 0,
+    IsLower = UpLo == LowerTriangularBit ? 1 : 0,
+    FirstTriangular = IsRowMajor == IsLower
+  };
+
+  ei_conj_if<NumTraits<Scalar>::IsComplex && IsRowMajor> conj0;
+  ei_conj_if<NumTraits<Scalar>::IsComplex && !IsRowMajor> conj1;
+
+  for (int i=0;i<size;i++)
+    res[i] = 0;
+
+  int bound = std::max(0,size-8) & 0xfffffffE;
+  if (FirstTriangular)
+    bound = size - bound;
+
+  for (int j=FirstTriangular ? bound : 0;
+       j<(FirstTriangular ? size : bound);j+=2)
+  {
+    register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+    register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+
+    Scalar t0 = rhs[j];
+    Packet ptmp0 = ei_pset1(t0);
+    Scalar t1 = rhs[j+1];
+    Packet ptmp1 = ei_pset1(t1);
+
+    Scalar t2 = 0;
+    Packet ptmp2 = ei_pset1(t2);
+    Scalar t3 = 0;
+    Packet ptmp3 = ei_pset1(t3);
+
+    size_t starti = FirstTriangular ? 0 : j+2;
+    size_t endi   = FirstTriangular ? j : size;
+    size_t alignedEnd = starti;
+    size_t alignedStart = (starti) + ei_alignmentOffset(&res[starti], endi-starti);
+    alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+
+    res[j]   += t0 * conj0(A0[j]);
+    if(FirstTriangular)
+    {
+      res[j+1]   += t1 * conj0(A1[j+1]);
+      res[j] += t1 * conj0(A1[j]);
+      t3 += conj1(A1[j]) * rhs[j];
+    }
+    else
+    {
+      res[j+1] += t0 * conj0(A0[j+1]) + t1 * conj0(A1[j+1]);
+      t2 += conj1(A0[j+1]) * rhs[j+1];
+    }
+
+    for (size_t i=starti; i<alignedStart; ++i)
+    {
+      res[i] += t0 * A0[i] + t1 * A1[i];
+      t2 += ei_conj(A0[i]) * rhs[i];
+      t3 += ei_conj(A1[i]) * rhs[i];
+    }
+    for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+    {
+      Packet A0i = ei_ploadu(&A0[i]);
+      Packet A1i = ei_ploadu(&A1[i]);
+      Packet Bi = ei_ploadu(&rhs[i]); // FIXME should be aligned in most cases
+      Packet Xi = ei_pload(&res[i]);
+
+      Xi = ei_padd(ei_padd(Xi, ei_pmul(ptmp0, conj0(A0i))), ei_pmul(ptmp1, conj0(A1i)));
+      ptmp2 = ei_padd(ptmp2, ei_pmul(conj1(A0i), Bi));
+      ptmp3 = ei_padd(ptmp3, ei_pmul(conj1(A1i), Bi));
+      ei_pstore(&res[i],Xi);
+    }
+    for (size_t i=alignedEnd; i<endi; i++)
+    {
+      res[i] += t0 * conj0(A0[i]) + t1 * conj0(A1[i]);
+      t2 += conj1(A0[i]) * rhs[i];
+      t3 += conj1(A1[i]) * rhs[i];
+    }
+
+    res[j]   += t2 + ei_predux(ptmp2);
+    res[j+1] += t3 + ei_predux(ptmp3);
+  }
+  for (int j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
+  {
+    register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+
+    Scalar t1 = rhs[j];
+    Scalar t2 = 0;
+    res[j] += t1 * conj0(A0[j]);
+    for (int i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++) {
+      res[i] += t1 * conj0(A0[i]);
+      t2 += conj1(A0[i]) * rhs[i];
+    }
+    res[j] += t2;
+  }
+}
+
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
--- a/Eigen/src/Core/util/CMakeLists.txt
+++ b/Eigen/src/Core/util/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_Core_util_SRCS "*.h")

 INSTALL(FILES 
  ${Eigen_Core_util_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/util
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/util COMPONENT Devel
  )
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -30,17 +30,19 @@
  * stored in some runtime variable.
  *
  * Explanation for the choice of this value:
-  * - It should be positive and larger than any reasonable compile-time-fixed number of rows or columns.
+  * - It should be positive and larger than the number of entries in any reasonable fixed-size matrix.
  *   This allows to simplify many compile-time conditions throughout Eigen.
  * - It should be smaller than the sqrt of INT_MAX. Indeed, we often multiply a number of rows with a number
  *   of columns in order to compute a number of coefficients. Even if we guard that with an "if" checking whether
  *   the values are Dynamic, we still get a compiler warning "integer overflow". So the only way to get around
  *   it would be a meta-selector. Doing this everywhere would reduce code readability and lenghten compilation times.
  *   Also, disabling compiler warnings for integer overflow, sounds like a bad idea.
+  * - It should be a prime number, because for example the old value 10000 led to bugs with 100x100 matrices.
  *
-  * If you wish to port Eigen to a platform where sizeof(int)==2, it is perfectly possible to set Dynamic to, say, 100.
+  * If you wish to port Eigen to a platform where sizeof(int)==2, it is perfectly possible to set Dynamic to, say, 97.
+  * However, changing the value of Dynamic breaks the ABI, as Dynamic is often used as a template parameter for Matrix.
  */
-const int Dynamic = 10000;
+const int Dynamic = 33331;

 /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
  * The value Infinity there means the L-infinity norm.
@@ -138,7 +140,7 @@ const unsigned int LinearAccessBit = 0x10;
  * First, references to the coefficients must be available through coeffRef(int, int). This rules out read-only
  * expressions whose coefficients are computed on demand by coeff(int, int). Second, the memory layout of the
  * array of coefficients must be exactly the natural one suggested by rows(), cols(), stride(), and the RowMajorBit.
-  * This rules out expressions such as DiagonalCoeffs, whose coefficients, though referencable, do not have
+  * This rules out expressions such as Diagonal, whose coefficients, though referencable, do not have
  * such a regular memory layout.
  */
 const unsigned int DirectAccessBit = 0x20;
@@ -184,23 +186,30 @@ const unsigned int HereditaryBits = RowMajorBit
                                  | EvalBeforeAssigningBit
                                  | SparseBit;

-// Possible values for the Mode parameter of part() and of extract()
+// diagonal means both upper and lower triangular
+const unsigned DiagonalBits = UpperTriangularBit | LowerTriangularBit;
+    
+// Possible values for the Mode parameter of part()
 const unsigned int UpperTriangular = UpperTriangularBit;
 const unsigned int StrictlyUpperTriangular = UpperTriangularBit | ZeroDiagBit;
 const unsigned int LowerTriangular = LowerTriangularBit;
 const unsigned int StrictlyLowerTriangular = LowerTriangularBit | ZeroDiagBit;
 const unsigned int SelfAdjoint = SelfAdjointBit;
-
-// additional possible values for the Mode parameter of extract()
 const unsigned int UnitUpperTriangular = UpperTriangularBit | UnitDiagBit;
 const unsigned int UnitLowerTriangular = LowerTriangularBit | UnitDiagBit;
-const unsigned int Diagonal = UpperTriangular | LowerTriangular;
+
+template<typename T> struct ei_is_diagonal
+{
+  enum {
+    ret = ( (unsigned int)(T::Flags) & DiagonalBits ) == DiagonalBits
+  };
+};

 enum { Aligned, Unaligned };
 enum { ForceAligned, AsRequested };
 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
-enum DirectionType { Vertical, Horizontal };
+enum DirectionType { Vertical, Horizontal, BothDirections };
 enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, DiagonalProduct, SparseTimeSparseProduct, SparseTimeDenseProduct, DenseTimeSparseProduct };

 enum {
@@ -225,11 +234,11 @@ enum {
 enum {
  ColMajor = 0,
  RowMajor = 0x1,  // it is only a coincidence that this is equal to RowMajorBit -- don't rely on that
+  /** \internal Align the matrix itself if it is vectorizable fixed-size */
+  AutoAlign = 0,
  /** \internal Don't require alignment for the matrix itself (the array of coefficients, if dynamically allocated, may still be
                requested to be aligned) */
-  DontAlign = 0,
-  /** \internal Align the matrix itself if it is vectorizable fixed-size */
-  AutoAlign = 0x2
+  DontAlign = 0x2
 };

 enum {
@@ -239,4 +248,23 @@ enum {
  HasDirectAccess = DirectAccessBit
 };

+enum TransformTraits {
+  Isometry      = 0x1,
+  Affine        = 0x2,
+  AffineCompact = 0x10 | Affine,
+  Projective    = 0x20
+};
+
+const int EiArch_Generic = 0x0;
+const int EiArch_SSE     = 0x1;
+const int EiArch_AltiVec = 0x2;
+
+#if defined EIGEN_VECTORIZE_SSE
+  const int EiArch = EiArch_SSE;
+#elif defined EIGEN_VECTORIZE_ALTIVEC
+  const int EiArch = EiArch_AltiVec;
+#else
+  const int EiArch = EiArch_Generic;
+#endif
+
 #endif // EIGEN_CONSTANTS_H
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -45,14 +45,17 @@ template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
 template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
 template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
 template<typename Lhs, typename Rhs, int ProductMode> class Product;
-template<typename CoeffsVectorType> class DiagonalMatrix;
-template<typename MatrixType> class DiagonalCoeffs;
+template<typename CoeffsVectorType, typename Derived> class DiagonalMatrixBase;
+template<typename CoeffsVectorType> class DiagonalMatrixWrapper;
+template<typename _Scalar, int _Size> class DiagonalMatrix;
+template<typename MatrixType, int Index> class Diagonal;
 template<typename MatrixType, int PacketAccess = AsRequested> class Map;
 template<typename MatrixType, unsigned int Mode> class Part;
 template<typename MatrixType, unsigned int Mode> class Extract;
 template<typename ExpressionType> class Cwise;
 template<typename ExpressionType> class WithFormat;
 template<typename MatrixType> struct CommaInitializer;
+template<typename Functor, typename EvalType> class ReturnByValue;


 template<typename Lhs, typename Rhs> struct ei_product_mode;
@@ -87,6 +90,8 @@ template<typename Scalar> struct ei_scalar_add_op;
 template<typename Scalar> struct ei_scalar_constant_op;
 template<typename Scalar> struct ei_scalar_identity_op;

+template<typename Scalar1,typename Scalar2> struct ei_scalar_multiple2_op;
+
 struct IOFormat;

 template<typename Scalar>
@@ -100,8 +105,11 @@ void ei_cache_friendly_product(
 template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType> class Select;
 template<typename MatrixType, typename BinaryOp, int Direction> class PartialReduxExpr;
 template<typename ExpressionType, int Direction> class PartialRedux;
+template<typename MatrixType,int RowFactor,int ColFactor> class Replicate;
+template<typename MatrixType, int Direction = BothDirections> class Reverse;

 template<typename MatrixType> class LU;
+template<typename MatrixType> class PartialLU;
 template<typename MatrixType> class QR;
 template<typename MatrixType> class SVD;
 template<typename MatrixType> class LLT;
@@ -113,11 +121,12 @@ template<typename Lhs, typename Rhs> class Cross;
 template<typename Scalar> class Quaternion;
 template<typename Scalar> class Rotation2D;
 template<typename Scalar> class AngleAxis;
-template<typename Scalar,int Dim> class Transform;
+template<typename Scalar,int Dim,int Mode=Affine> class Transform;
 template <typename _Scalar, int _AmbientDim> class ParametrizedLine;
 template <typename _Scalar, int _AmbientDim> class Hyperplane;
 template<typename Scalar,int Dim> class Translation;
-template<typename Scalar,int Dim> class Scaling;
+template<typename Scalar> class UniformScaling;
+template<typename MatrixType,int Direction> class Homogeneous;

 // Sparse module:
 template<typename Lhs, typename Rhs, int ProductMode> class SparseProduct;
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -30,7 +30,7 @@

 #define EIGEN_WORLD_VERSION 2
 #define EIGEN_MAJOR_VERSION 0
-#define EIGEN_MINOR_VERSION 2
+#define EIGEN_MINOR_VERSION 52

 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -41,34 +41,41 @@
 // because extra memory must be allocated for bookkeeping).
 // if the compiler is not GNUC, just cross fingers that the architecture isn't too exotic, because we don't want
 // to keep track of all the different preprocessor symbols for all compilers.
-#if !defined(__GNUC__) || defined(__i386__) || defined(__x86_64__) || defined(__ppc__) || defined(__ia64__)
+#if (!defined(__GNUC__)) || defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ia64__)
  #define EIGEN_ARCH_WANTS_ALIGNMENT 1
 #else
+  #define EIGEN_ARCH_WANTS_ALIGNMENT 0
+#endif
+
+// EIGEN_ALIGN is the true test whether we want to align or not. It takes into account both the user choice to explicitly disable
+// alignment (EIGEN_DONT_ALIGN) and the architecture config (EIGEN_ARCH_WANTS_ALIGNMENT). Henceforth, only EIGEN_ALIGN should be used.
+#if EIGEN_ARCH_WANTS_ALIGNMENT && !defined(EIGEN_DONT_ALIGN)
+  #define EIGEN_ALIGN 1
+#else
+  #define EIGEN_ALIGN 0
  #ifdef EIGEN_VECTORIZE
    #error Vectorization enabled, but the architecture is not listed among those for which we require 16 byte alignment. If you added vectorization for another architecture, you also need to edit this list.
  #endif
-  #define EIGEN_ARCH_WANTS_ALIGNMENT 0
  #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
    #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
  #endif
 #endif

-
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION RowMajor
 #else
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ColMajor
 #endif

-/** \internal  Defines the maximal loop size to enable meta unrolling of loops.
-  *            Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
-  *            it does not correspond to the number of iterations or the number of instructions
+/** Defines the maximal loop size to enable meta unrolling of loops.
+  * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
+  * it does not correspond to the number of iterations or the number of instructions
  */
 #ifndef EIGEN_UNROLLING_LIMIT
 #define EIGEN_UNROLLING_LIMIT 100
 #endif

-/** \internal Define the maximal size in Bytes of blocks fitting in CPU cache.
+/** Defines the maximal size in Bytes of blocks fitting in CPU cache.
  * The current value is set to generate blocks of 256x256 for float
  *
  * Typically for a single-threaded application you would set that to 25% of the size of your CPU caches in bytes
@@ -77,9 +84,13 @@
 #define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256)
 #endif

-// FIXME this should go away quickly
-#ifdef EIGEN_TUNE_FOR_L2_CACHE_SIZE
-#error EIGEN_TUNE_FOR_L2_CACHE_SIZE is now called EIGEN_TUNE_FOR_CPU_CACHE_SIZE.
+/** Allows to disable some optimizations which might affect the accuracy of the result.
+  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
+  * They currently include:
+  *   - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled. 
+  */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
 #endif

 #define USING_PART_OF_NAMESPACE_EIGEN \
@@ -159,13 +170,25 @@ using Eigen::ei_cos;
 #define EIGEN_DEPRECATED
 #endif

+#if (defined __GNUC__)
+#define EIGEN_UNUSED __attribute__((unused))
+#else
+#define EIGEN_UNUSED
+#endif
+
+#if (defined __GNUC__)
+#define EIGEN_ASM_COMMENT(X)  asm("#"X)
+#else
+#define EIGEN_ASM_COMMENT(X)
+#endif
+
 /* EIGEN_ALIGN_128 forces data to be 16-byte aligned, EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
 * so that vectorization doesn't affect binary compatibility.
 *
 * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
 * vectorized and non-vectorized code.
 */
-#if !EIGEN_ARCH_WANTS_ALIGNMENT
+#if !EIGEN_ALIGN
 #define EIGEN_ALIGN_128
 #elif (defined __GNUC__)
 #define EIGEN_ALIGN_128 __attribute__((aligned(16)))
@@ -178,13 +201,24 @@ using Eigen::ei_cos;
 #define EIGEN_RESTRICT __restrict

 #ifndef EIGEN_STACK_ALLOCATION_LIMIT
-#define EIGEN_STACK_ALLOCATION_LIMIT 16000000
+#define EIGEN_STACK_ALLOCATION_LIMIT 1000000
 #endif

 #ifndef EIGEN_DEFAULT_IO_FORMAT
 #define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat()
 #endif

+// just an empty macro !
+#define EIGEN_EMPTY
+
+// concatenate two tokens
+#define EIGEN_CAT2(a,b) a ## b
+#define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
+
+// convert a token to a string
+#define EIGEN_MAKESTRING2(a) #a
+#define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a)
+
 // format used in Eigen's documentation
 // needed to define it here as escaping characters in CMake add_definition's argument seems very problematic.
 #define EIGEN_DOCS_IO_FORMAT IOFormat(3, AlignCols, " ", "\n", "", "")
@@ -219,6 +253,7 @@ typedef BaseClass Base; \
 typedef typename Eigen::ei_traits<Derived>::Scalar Scalar; \
 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
 typedef typename Base::PacketScalar PacketScalar; \
+typedef typename Base::CoeffReturnType CoeffReturnType; \
 typedef typename Eigen::ei_nested<Derived>::type Nested; \
 enum { RowsAtCompileTime = Eigen::ei_traits<Derived>::RowsAtCompileTime, \
       ColsAtCompileTime = Eigen::ei_traits<Derived>::ColsAtCompileTime, \
@@ -236,15 +271,4 @@ _EIGEN_GENERIC_PUBLIC_INTERFACE(Derived, Eigen::MatrixBase<Derived>)
 #define EIGEN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b)
 #define EIGEN_ENUM_MAX(a,b) (((int)a >= (int)b) ? (int)a : (int)b)

-// just an empty macro !
-#define EIGEN_EMPTY
-
-// concatenate two tokens
-#define EIGEN_CAT2(a,b) a ## b
-#define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
-
-// convert a token to a string
-#define EIGEN_MAKESTRING2(a) #a
-#define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a)
-
 #endif // EIGEN_MACROS_H
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -65,7 +65,7 @@ inline void ei_handmade_aligned_free(void *ptr)
 }

 /** \internal allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
-  * On allocation error, the returned pointer is undefined, but if exceptions are enabled then a std::bad_alloc is thrown.
+  * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown.
  */
 inline void* ei_aligned_malloc(size_t size)
 {
@@ -73,37 +73,30 @@ inline void* ei_aligned_malloc(size_t size)
    ei_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
  #endif

-  void *result;
-  #if EIGEN_HAS_POSIX_MEMALIGN && EIGEN_ARCH_WANTS_ALIGNMENT && !EIGEN_MALLOC_ALREADY_ALIGNED
-    #ifdef EIGEN_EXCEPTIONS
-      const int failed =
-    #endif
-    posix_memalign(&result, 16, size);
+  void *result;  
+  #if !EIGEN_ALIGN
+    result = malloc(size);
+  #elif EIGEN_MALLOC_ALREADY_ALIGNED
+    result = malloc(size);
+  #elif EIGEN_HAS_POSIX_MEMALIGN
+    if(posix_memalign(&result, 16, size)) result = 0;
+  #elif EIGEN_HAS_MM_MALLOC
+    result = _mm_malloc(size, 16);
+  #elif (defined _MSC_VER)
+    result = _aligned_malloc(size, 16);
  #else
-    #if !EIGEN_ARCH_WANTS_ALIGNMENT
-      result = malloc(size);
-    #elif EIGEN_MALLOC_ALREADY_ALIGNED
-      result = malloc(size);
-    #elif EIGEN_HAS_MM_MALLOC
-      result = _mm_malloc(size, 16);
-    #elif (defined _MSC_VER)
-      result = _aligned_malloc(size, 16);
-    #else
-      result = ei_handmade_aligned_malloc(size);
-    #endif
-    #ifdef EIGEN_EXCEPTIONS
-      const int failed = (result == 0);
-    #endif
+    result = ei_handmade_aligned_malloc(size);
  #endif
+    
  #ifdef EIGEN_EXCEPTIONS
-    if(failed)
+    if(result == 0)
      throw std::bad_alloc();
  #endif
  return result;
 }

 /** allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
-  * On allocation error, the returned pointer is undefined, but if exceptions are enabled then a std::bad_alloc is thrown.
+  * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown.
  */
 template<bool Align> inline void* ei_conditional_aligned_malloc(size_t size)
 {
@@ -143,7 +136,7 @@ template<typename T, bool Align> inline T* ei_conditional_aligned_new(size_t siz
  */
 inline void ei_aligned_free(void *ptr)
 {
-  #if !EIGEN_ARCH_WANTS_ALIGNMENT
+  #if !EIGEN_ALIGN
    free(ptr);
  #elif EIGEN_MALLOC_ALREADY_ALIGNED
    free(ptr);
@@ -237,17 +230,40 @@ inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
                                                   ei_aligned_stack_free(PTR,sizeof(TYPE)*SIZE);} while(0)


-#if EIGEN_ARCH_WANTS_ALIGNMENT
+#if EIGEN_ALIGN
+  #ifdef EIGEN_EXCEPTIONS
+    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      void* operator new(size_t size, const std::nothrow_t&) throw() { \
+        try { return Eigen::ei_conditional_aligned_malloc<NeedsToAlign>(size); } \
+        catch (...) { return 0; } \
+        return 0; \
+      }
+  #else
+    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      void* operator new(size_t size, const std::nothrow_t&) throw() { \
+        return Eigen::ei_conditional_aligned_malloc<NeedsToAlign>(size); \
+      }
+  #endif
+
  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
-      void *operator new(size_t size) throw() { \
+      void *operator new(size_t size) { \
        return Eigen::ei_conditional_aligned_malloc<NeedsToAlign>(size); \
      } \
-      void *operator new[](size_t size) throw() { \
+      void *operator new[](size_t size) { \
        return Eigen::ei_conditional_aligned_malloc<NeedsToAlign>(size); \
      } \
-      void operator delete(void * ptr) { Eigen::ei_conditional_aligned_free<NeedsToAlign>(ptr); } \
-      void operator delete[](void * ptr) { Eigen::ei_conditional_aligned_free<NeedsToAlign>(ptr); } \
-      void *operator new(size_t, void *ptr) throw() { return ptr; } \
+      void operator delete(void * ptr) throw() { Eigen::ei_conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete[](void * ptr) throw() { Eigen::ei_conditional_aligned_free<NeedsToAlign>(ptr); } \
+      /* in-place new and delete. since (at least afaik) there is no actual   */ \
+      /* memory allocated we can safely let the default implementation handle */ \
+      /* this particular case. */ \
+      static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \
+      void operator delete(void * memory, void *ptr) throw() { return ::operator delete(memory,ptr); } \
+      /* nothrow-new (returns zero instead of std::bad_alloc) */ \
+      EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      void operator delete(void *ptr, const std::nothrow_t&) throw() { \
+        Eigen::ei_conditional_aligned_free<NeedsToAlign>(ptr); \
+      } \
      typedef void ei_operator_new_marker_type;
 #else
  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
@@ -289,34 +305,34 @@ public:
        typedef aligned_allocator<U> other;
    };

-    pointer address( reference value ) const
+    pointer address( reference value ) const 
    {
        return &value;
    }

-    const_pointer address( const_reference value ) const
+    const_pointer address( const_reference value ) const 
    {
        return &value;
    }

-    aligned_allocator() throw()
+    aligned_allocator() throw() 
    {
    }

-    aligned_allocator( const aligned_allocator& ) throw()
+    aligned_allocator( const aligned_allocator& ) throw() 
    {
    }

    template<class U>
-    aligned_allocator( const aligned_allocator<U>& ) throw()
+    aligned_allocator( const aligned_allocator<U>& ) throw() 
    {
    }

-    ~aligned_allocator() throw()
+    ~aligned_allocator() throw() 
    {
    }

-    size_type max_size() const throw()
+    size_type max_size() const throw() 
    {
        return std::numeric_limits<size_type>::max();
    }
@@ -327,17 +343,17 @@ public:
        return static_cast<pointer>( ei_aligned_malloc( num * sizeof(T) ) );
    }

-    void construct( pointer p, const T& value )
+    void construct( pointer p, const T& value ) 
    {
        ::new( p ) T( value );
    }

-    void destroy( pointer p )
+    void destroy( pointer p ) 
    {
        p->~T();
    }

-    void deallocate( pointer p, size_type /*num*/ )
+    void deallocate( pointer p, size_type /*num*/ ) 
    {
        ei_aligned_free( p );
    }
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -64,6 +64,14 @@ template<typename T> struct ei_cleantype<T&>        { typedef typename ei_cleant
 template<typename T> struct ei_cleantype<const T*>  { typedef typename ei_cleantype<T>::type type; };
 template<typename T> struct ei_cleantype<T*>        { typedef typename ei_cleantype<T>::type type; };

+/** \internal Allows to enable/disable an overload
+  * according to a compile time condition.
+  */
+template<bool Condition, typename T> struct ei_enable_if;
+
+template<typename T> struct ei_enable_if<true,T>
+{ typedef T type; };
+
 /** \internal
  * Convenient struct to get the result type of a unary or binary functor.
  *
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -73,7 +73,9 @@
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY,
        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES,
        THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES,
-        INVALID_MATRIX_TEMPLATE_PARAMETERS
+        INVALID_MATRIX_TEMPLATE_PARAMETERS,
+        BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER,
+        THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX
      };
    };

--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -73,11 +73,7 @@ template<typename T> struct ei_functor_traits
  };
 };

-template<typename T> struct ei_packet_traits
-{
-  typedef T type;
-  enum {size=1};
-};
+template<typename T> struct ei_packet_traits;

 template<typename T> struct ei_unpacket_traits
 {
@@ -93,7 +89,7 @@ class ei_compute_matrix_flags
      inner_max_size = row_major_bit ? MaxCols : MaxRows,
      is_big = inner_max_size == Dynamic,
      is_packet_size_multiple = (Cols*Rows) % ei_packet_traits<Scalar>::size == 0,
-      aligned_bit = ((Options&AutoAlign) && (is_big || is_packet_size_multiple)) ? AlignedBit : 0,
+      aligned_bit = (((Options&DontAlign)==0) && (is_big || is_packet_size_multiple)) ? AlignedBit : 0,
      packet_access_bit = ei_packet_traits<Scalar>::size > 1 && aligned_bit ? PacketAccessBit : 0
    };

@@ -202,6 +198,28 @@ template<unsigned int Flags> struct ei_are_flags_consistent
  };
 };

+/** \internal Helper base class to add a scalar multiple operator
+  * overloads for complex types */
+template<typename Derived,typename Scalar,typename OtherScalar,
+         bool EnableIt = !ei_is_same_type<Scalar,OtherScalar>::ret >
+struct ei_special_scalar_op_base
+{
+  // dummy operator* so that the 
+  // "using ei_special_scalar_op_base::operator*" compiles
+  void operator*() const;
+};
+
+template<typename Derived,typename Scalar,typename OtherScalar>
+struct ei_special_scalar_op_base<Derived,Scalar,OtherScalar,true>
+{
+  const CwiseUnaryOp<ei_scalar_multiple2_op<Scalar,OtherScalar>, Derived>
+  operator*(const OtherScalar& scalar) const
+  {
+    return CwiseUnaryOp<ei_scalar_multiple2_op<Scalar,OtherScalar>, Derived>
+      (*static_cast<const Derived*>(this), ei_scalar_multiple2_op<Scalar,OtherScalar>(scalar));
+  }
+};
+
 /** \internal Gives the type of a sub-matrix or sub-vector of a matrix of type \a ExpressionType and size \a Size
  * TODO: could be a good idea to define a big ReturnType struct ??
  */
@@ -211,9 +229,26 @@ template<typename ExpressionType, int RowsOrSize=Dynamic, int Cols=Dynamic> stru
  typedef Block<ExpressionType, RowsOrSize, Cols> Type;
 };

-template<typename CurrentType, typename NewType> struct ei_cast_return_type
+template<typename ExpressionType> struct HNormalizedReturnType {
+
+  enum {
+    SizeAtCompileTime = ExpressionType::SizeAtCompileTime,
+    SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
+  };
+  typedef Block<ExpressionType,
+                ei_traits<ExpressionType>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
+                ei_traits<ExpressionType>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> StartMinusOne;
+  typedef CwiseUnaryOp<ei_scalar_quotient1_op<typename ei_traits<ExpressionType>::Scalar>, 
+              NestByValue<StartMinusOne> > Type;
+};
+
+template<typename XprType, typename CastType> struct ei_cast_return_type
 {
-  typedef typename ei_meta_if<ei_is_same_type<CurrentType,NewType>::ret,const CurrentType&,NewType>::ret type;
+  typedef typename XprType::Scalar CurrentScalarType;
+  typedef typename ei_cleantype<CastType>::type _CastType;
+  typedef typename _CastType::Scalar NewScalarType;
+  typedef typename ei_meta_if<ei_is_same_type<CurrentScalarType,NewScalarType>::ret,
+                              const XprType&,CastType>::ret type;
 };

 #endif // EIGEN_XPRHELPER_H
--- a/Eigen/src/Geometry/AlignedBox.h
+++ b/Eigen/src/Geometry/AlignedBox.h
@@ -41,7 +41,7 @@ template <typename _Scalar, int _AmbientDim>
 class AlignedBox
 {
 public:
-EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
  enum { AmbientDimAtCompileTime = _AmbientDim };
  typedef _Scalar Scalar;
  typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -85,6 +85,9 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==
  /** \returns a non const reference to the maximal corner */
  inline VectorType& max() { return m_max; }

+  /** \returns the center of the box */
+  inline VectorType center() const { return (m_min + m_max) / 2; }
+
  /** \returns true if the point \a p is inside the box \c *this. */
  inline bool contains(const VectorType& p) const
  { return (m_min.cwise()<=p).all() && (p.cwise()<=m_max).all(); }
@@ -105,6 +108,14 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==
  inline AlignedBox& clamp(const AlignedBox& b)
  { m_min = m_min.cwise().max(b.m_min); m_max = m_max.cwise().min(b.m_max); return *this; }

+  /** Returns an AlignedBox that is the intersection of \a b and \c *this */
+  inline AlignedBox intersection(const AlignedBox &b) const
+  { return AlignedBox(m_min.cwise().max(b.m_min), m_max.cwise().min(b.m_max)); }
+
+  /** Returns an AlignedBox that is the union of \a b and \c *this */
+  inline AlignedBox merged(const AlignedBox &b) const
+  { return AlignedBox(m_min.cwise().min(b.m_min), m_max.cwise().max(b.m_max)); }
+
  /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
  inline AlignedBox& translate(const VectorType& t)
  { m_min += t; m_max += t; return *this; }
@@ -115,6 +126,12 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==
    */
  inline Scalar squaredExteriorDistance(const VectorType& p) const;

+  /** \returns the squared distance between the boxes \a b and \c *this,
+    * and zero if the boxes intersect.
+    * \sa exteriorDistance()
+    */
+  inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
+
  /** \returns the distance between the point \a p and the box \c *this,
    * and zero if \a p is inside the box.
    * \sa squaredExteriorDistance()
@@ -122,6 +139,13 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==
  inline Scalar exteriorDistance(const VectorType& p) const
  { return ei_sqrt(squaredExteriorDistance(p)); }

+  /** \returns the distance between the boxes \a b and \c *this,
+    * and zero if the boxes intersect.
+    * \sa squaredExteriorDistance()
+    */
+  inline Scalar exteriorDistance(const AlignedBox& b) const
+  { return ei_sqrt(squaredExteriorDistance(b)); }
+
  /** \returns \c *this with scalar type casted to \a NewScalarType
    *
    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
@@ -170,4 +194,19 @@ inline Scalar AlignedBox<Scalar,AmbiantDim>::squaredExteriorDistance(const Vecto
  return dist2;
 }

+template<typename Scalar,int AmbiantDim>
+inline Scalar AlignedBox<Scalar,AmbiantDim>::squaredExteriorDistance(const AlignedBox& b) const
+{
+  Scalar dist2 = 0.;
+  Scalar aux;
+  for (int k=0; k<dim(); ++k)
+  {
+    if ((aux = (b.m_min[k]-m_max[k]))>0.)
+      dist2 += aux*aux;
+    else if ( (aux = (m_min[k]-b.m_max[k]))>0. )
+      dist2 += aux*aux;
+  }
+  return dist2;
+}
+
 #endif // EIGEN_ALIGNEDBOX_H
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -33,6 +33,8 @@
  *
  * \param _Scalar the scalar type, i.e., the type of the coefficients.
  *
+  * \warning When setting up an AngleAxis object, the axis vector \b must \b be \b normalized.
+  *
  * The following two typedefs are provided for convenience:
  * \li \c AngleAxisf for \c float
  * \li \c AngleAxisd for \c double
@@ -82,7 +84,10 @@ public:
  /** Default constructor without initialization. */
  AngleAxis() {}
  /** Constructs and initialize the angle-axis rotation from an \a angle in radian
-    * and an \a axis which must be normalized. */
+    * and an \a axis which \b must \b be \b normalized.
+    * 
+    * \warning If the \a axis vector is not normalized, then the angle-axis object
+    *          represents an invalid rotation. */
  template<typename Derived>
  inline AngleAxis(Scalar angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
@@ -109,18 +114,6 @@ public:
  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
  { return a * QuaternionType(b); }

-  /** Concatenates two rotations */
-  inline Matrix3 operator* (const Matrix3& other) const
-  { return toRotationMatrix() * other; }
-
-  /** Concatenates two rotations */
-  inline friend Matrix3 operator* (const Matrix3& a, const AngleAxis& b)
-  { return a * b.toRotationMatrix(); }
-
-  /** Applies rotation to vector */
-  inline Vector3 operator* (const Vector3& other) const
-  { return toRotationMatrix() * other; }
-
  /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
  AngleAxis inverse() const
  { return AngleAxis(-m_angle, m_axis); }
--- a/Eigen/src/Geometry/CMakeLists.txt
+++ b/Eigen/src/Geometry/CMakeLists.txt
@@ -2,5 +2,7 @@ FILE(GLOB Eigen_Geometry_SRCS "*.h")

 INSTALL(FILES
  ${Eigen_Geometry_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Geometry
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Geometry COMPONENT Devel
  )
+
+ADD_SUBDIRECTORY(arch)
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -0,0 +1,260 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_HOMOGENEOUS_H
+#define EIGEN_HOMOGENEOUS_H
+
+/** \geometry_module \ingroup Geometry_Module
+  * \nonstableyet
+  * \class Homogeneous
+  *
+  * \brief Expression of one (or a set of) homogeneous vector(s)
+  *
+  * \param MatrixType the type of the object in which we are making homogeneous
+  *
+  * This class represents an expression of one (or a set of) homogeneous vector(s).
+  * It is the return type of MatrixBase::homogeneous() and most of the time
+  * this is the only way it is used.
+  *
+  * \sa MatrixBase::homogeneous()
+  */
+template<typename MatrixType,int Direction>
+struct ei_traits<Homogeneous<MatrixType,Direction> >
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename ei_nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ei_unref<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ?
+                  int(MatrixType::RowsAtCompileTime) + 1 : Dynamic,
+    ColsPlusOne = (MatrixType::ColsAtCompileTime != Dynamic) ?
+                  int(MatrixType::ColsAtCompileTime) + 1 : Dynamic,
+    RowsAtCompileTime = Direction==Vertical  ?  RowsPlusOne : MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = Direction==Horizontal ? ColsPlusOne : MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    Flags = _MatrixTypeNested::Flags & HereditaryBits,
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+  };
+};
+
+template<typename MatrixType,typename Lhs> struct ei_homogeneous_left_product_impl;
+template<typename MatrixType,typename Rhs> struct ei_homogeneous_right_product_impl;
+
+template<typename MatrixType,int Direction> class Homogeneous
+  : public MatrixBase<Homogeneous<MatrixType,Direction> >
+{
+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Homogeneous)
+
+    inline Homogeneous(const MatrixType& matrix)
+      : m_matrix(matrix)
+    {}
+
+    inline int rows() const { return m_matrix.rows() + (Direction==Vertical   ? 1 : 0); }
+    inline int cols() const { return m_matrix.cols() + (Direction==Horizontal ? 1 : 0); }
+
+    inline Scalar coeff(int row, int col) const
+    {
+      if(  (Direction==Vertical   && row==m_matrix.rows())
+        || (Direction==Horizontal && col==m_matrix.cols()))
+        return 1;
+      return m_matrix.coeff(row, col);
+    }
+
+    template<typename Rhs>
+    inline const ei_homogeneous_right_product_impl<Homogeneous,Rhs>
+    operator* (const MatrixBase<Rhs>& rhs) const
+    {
+      ei_assert(Direction==Horizontal);
+      return ei_homogeneous_right_product_impl<Homogeneous,Rhs>(m_matrix,rhs.derived());
+    }
+
+    template<typename Lhs> friend
+    inline const ei_homogeneous_left_product_impl<Homogeneous,Lhs>
+    operator* (const MatrixBase<Lhs>& lhs, const Homogeneous& rhs)
+    {
+      ei_assert(Direction==Vertical);
+      return ei_homogeneous_left_product_impl<Homogeneous,Lhs>(lhs.derived(),rhs.m_matrix);
+    }
+
+    template<typename Scalar, int Dim, int Mode> friend
+    inline const ei_homogeneous_left_product_impl<Homogeneous,
+      typename Transform<Scalar,Dim,Mode>::AffinePart>
+    operator* (const Transform<Scalar,Dim,Mode>& tr, const Homogeneous& rhs)
+    {
+      ei_assert(Direction==Vertical);
+      return ei_homogeneous_left_product_impl<Homogeneous,typename Transform<Scalar,Dim,Mode>::AffinePart>
+        (tr.affine(),rhs.m_matrix);
+    }
+
+    template<typename Scalar, int Dim> friend
+    inline const ei_homogeneous_left_product_impl<Homogeneous,
+      typename Transform<Scalar,Dim,Projective>::MatrixType>
+    operator* (const Transform<Scalar,Dim,Projective>& tr, const Homogeneous& rhs)
+    {
+      ei_assert(Direction==Vertical);
+      return ei_homogeneous_left_product_impl<Homogeneous,typename Transform<Scalar,Dim,Projective>::MatrixType>
+        (tr.matrix(),rhs.m_matrix);
+    }
+
+  protected:
+    const typename MatrixType::Nested m_matrix;
+};
+
+/** \geometry_module
+  * \nonstableyet
+  * \return an expression of the equivalent homogeneous vector
+  *
+  * \vectoronly
+  *
+  * Example: \include MatrixBase_homogeneous.cpp
+  * Output: \verbinclude MatrixBase_homogeneous.out
+  *
+  * \sa class Homogeneous
+  */
+template<typename Derived>
+inline const typename MatrixBase<Derived>::HomogeneousReturnType
+MatrixBase<Derived>::homogeneous() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return derived();
+}
+
+/** \geometry_module
+  * \nonstableyet
+  * \returns a matrix expression of homogeneous column (or row) vectors
+  *
+  * Example: \include PartialRedux_homogeneous.cpp
+  * Output: \verbinclude PartialRedux_homogeneous.out
+  *
+  * \sa MatrixBase::homogeneous() */
+template<typename ExpressionType, int Direction>
+inline const Homogeneous<ExpressionType,Direction>
+PartialRedux<ExpressionType,Direction>::homogeneous() const
+{
+  return _expression();
+}
+
+/** \geometry_module
+  * \nonstableyet
+  * \returns an expression of the homogeneous normalized vector of \c *this
+  *
+  * Example: \include MatrixBase_hnormalized.cpp
+  * Output: \verbinclude MatrixBase_hnormalized.out
+  *
+  * \sa PartialRedux::hnormalized() */
+template<typename Derived>
+inline const typename MatrixBase<Derived>::HNormalizedReturnType
+MatrixBase<Derived>::hnormalized() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return StartMinusOne(derived(),0,0,
+    ColsAtCompileTime==1?size()-1:1,
+    ColsAtCompileTime==1?1:size()-1).nestByValue() / coeff(size()-1);
+}
+
+/** \geometry_module
+  * \nonstableyet
+  * \returns an expression of the homogeneous normalized vector of \c *this
+  *
+  * Example: \include DirectionWise_hnormalized.cpp
+  * Output: \verbinclude DirectionWise_hnormalized.out
+  *
+  * \sa MatrixBase::hnormalized() */
+template<typename ExpressionType, int Direction>
+inline const typename PartialRedux<ExpressionType,Direction>::HNormalizedReturnType
+PartialRedux<ExpressionType,Direction>::hnormalized() const
+{
+  return HNormalized_Block(_expression(),0,0,
+      Direction==Vertical   ? _expression().rows()-1 : _expression().rows(),
+      Direction==Horizontal ? _expression().cols()-1 : _expression().cols()).nestByValue()
+    .cwise()/
+      Replicate<NestByValue<HNormalized_Factors>,
+                Direction==Vertical   ? HNormalized_SizeMinusOne : 1,
+                Direction==Horizontal ? HNormalized_SizeMinusOne : 1>
+        (HNormalized_Factors(_expression(),
+          Direction==Vertical    ? _expression().rows()-1:0,
+          Direction==Horizontal  ? _expression().cols()-1:0,
+          Direction==Vertical    ? 1 : _expression().rows(),
+          Direction==Horizontal  ? 1 : _expression().cols()).nestByValue(),
+         Direction==Vertical   ? _expression().rows()-1 : 1,
+         Direction==Horizontal ? _expression().cols()-1 : 1).nestByValue();
+}
+
+template<typename MatrixType,typename Lhs>
+struct ei_homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs>
+  : public ReturnByValue<ei_homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs>,
+                         Matrix<typename ei_traits<MatrixType>::Scalar,
+                                Lhs::RowsAtCompileTime,MatrixType::ColsAtCompileTime> >
+{
+  typedef typename ei_cleantype<typename Lhs::Nested>::type LhsNested;
+  ei_homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
+    : m_lhs(lhs), m_rhs(rhs)
+  {}
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    // FIXME investigate how to allow lazy evaluation of this product when possible
+    dst = Block<LhsNested,
+              LhsNested::RowsAtCompileTime,
+              LhsNested::ColsAtCompileTime==Dynamic?Dynamic:LhsNested::ColsAtCompileTime-1>
+            (m_lhs,0,0,m_lhs.rows(),m_lhs.cols()-1) * m_rhs;
+    dst += m_lhs.col(m_lhs.cols()-1).rowwise()
+            .template replicate<MatrixType::ColsAtCompileTime>(m_rhs.cols());
+  }
+
+  const typename Lhs::Nested m_lhs;
+  const typename MatrixType::Nested m_rhs;
+};
+
+template<typename MatrixType,typename Rhs>
+struct ei_homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>
+  : public ReturnByValue<ei_homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>,
+                         Matrix<typename ei_traits<MatrixType>::Scalar,
+                                MatrixType::RowsAtCompileTime, Rhs::ColsAtCompileTime> >
+{
+  typedef typename ei_cleantype<typename Rhs::Nested>::type RhsNested;
+  ei_homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs)
+    : m_lhs(lhs), m_rhs(rhs)
+  {}
+
+  template<typename Dest> void evalTo(Dest& dst) const
+  {
+    // FIXME investigate how to allow lazy evaluation of this product when possible
+    dst = m_lhs * Block<RhsNested,
+                        RhsNested::RowsAtCompileTime==Dynamic?Dynamic:RhsNested::RowsAtCompileTime-1,
+                        RhsNested::ColsAtCompileTime>
+            (m_rhs,0,0,m_rhs.rows()-1,m_rhs.cols());
+    dst += m_rhs.row(m_rhs.rows()-1).colwise()
+            .template replicate<MatrixType::RowsAtCompileTime>(m_lhs.rows());
+  }
+
+  const typename MatrixType::Nested m_lhs;
+  const typename Rhs::Nested m_rhs;
+
+};
+
+#endif // EIGEN_HOMOGENEOUS_H
--- a/Eigen/src/Geometry/OrthoMethods.h
+++ b/Eigen/src/Geometry/OrthoMethods.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // Eigen is free software; you can redistribute it and/or
@@ -31,6 +31,7 @@
  * \returns the cross product of \c *this and \a other
  *
  * Here is a very good explanation of cross-product: http://xkcd.com/199/
+  * \sa MatrixBase::cross3()
  */
 template<typename Derived>
 template<typename OtherDerived>
@@ -38,6 +39,7 @@ inline typename MatrixBase<Derived>::PlainMatrixType
 MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 {
  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3)
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)

  // Note that there is no need for an expression here since the compiler
  // optimize such a small temporary very well (even within a complex expression)
@@ -50,15 +52,116 @@ MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
  );
 }

+template< int Arch,typename VectorLhs,typename VectorRhs,
+          typename Scalar = typename VectorLhs::Scalar,
+          int Vectorizable = (VectorLhs::Flags&VectorRhs::Flags)&PacketAccessBit>
+struct ei_cross3_impl {
+  inline static typename ei_plain_matrix_type<VectorLhs>::type
+  run(const VectorLhs& lhs, const VectorRhs& rhs)
+  {
+    return typename ei_plain_matrix_type<VectorLhs>::type(
+      lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1),
+      lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2),
+      lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0),
+      0
+    );
+  }
+};
+
+/** \geometry_module
+  *
+  * \returns the cross product of \c *this and \a other using only the x, y, and z coefficients
+  *
+  * The size of \c *this and \a other must be four. This function is especially useful
+  * when using 4D vectors instead of 3D ones to get advantage of SSE/AltiVec vectorization.
+  *
+  * \sa MatrixBase::cross()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline typename MatrixBase<Derived>::PlainMatrixType
+MatrixBase<Derived>::cross3(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,4)
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,4)
+
+  typedef typename ei_nested<Derived,2>::type DerivedNested;
+  typedef typename ei_nested<OtherDerived,2>::type OtherDerivedNested;
+  const DerivedNested lhs(derived());
+  const OtherDerivedNested rhs(other.derived());
+
+  return ei_cross3_impl<EiArch,typename ei_cleantype<DerivedNested>::type,
+                               typename ei_cleantype<OtherDerivedNested>::type>::run(lhs,rhs);
+}
+
+/** \returns a matrix expression of the cross product of each column or row
+  * of the referenced expression with the \a other vector.
+  *
+  * The referenced matrix must have one dimension equal to 3.
+  * The result matrix has the same dimensions than the referenced one.
+  *
+  * \geometry_module
+  *
+  * \sa MatrixBase::cross() */
+template<typename ExpressionType, int Direction>
+template<typename OtherDerived>
+const typename PartialRedux<ExpressionType,Direction>::CrossReturnType
+PartialRedux<ExpressionType,Direction>::cross(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
+  EIGEN_STATIC_ASSERT((ei_is_same_type<Scalar, typename OtherDerived::Scalar>::ret),
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  CrossReturnType res(_expression().rows(),_expression().cols());
+  if(Direction==Vertical)
+  {
+    ei_assert(CrossReturnType::RowsAtCompileTime==3 && "the matrix must have exactly 3 rows");
+    res.row(0) = _expression().row(1) * other.coeff(2) - _expression().row(2) * other.coeff(1);
+    res.row(1) = _expression().row(2) * other.coeff(0) - _expression().row(0) * other.coeff(2);
+    res.row(2) = _expression().row(0) * other.coeff(1) - _expression().row(1) * other.coeff(0);
+  }
+  else
+  {
+    ei_assert(CrossReturnType::ColsAtCompileTime==3 && "the matrix must have exactly 3 columns");
+    res.col(0) = _expression().col(1) * other.coeff(2) - _expression().col(2) * other.coeff(1);
+    res.col(1) = _expression().col(2) * other.coeff(0) - _expression().col(0) * other.coeff(2);
+    res.col(2) = _expression().col(0) * other.coeff(1) - _expression().col(1) * other.coeff(0);
+  }
+  return res;
+}
+
 template<typename Derived, int Size = Derived::SizeAtCompileTime>
 struct ei_unitOrthogonal_selector
 {
  typedef typename ei_plain_matrix_type<Derived>::type VectorType;
  typedef typename ei_traits<Derived>::Scalar Scalar;
  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar,2,1> Vector2;
  inline static VectorType run(const Derived& src)
  {
-    VectorType perp(src.size());
+    VectorType perp = VectorType::Zero(src.size());
+    int maxi = 0;
+    int sndi = 0;
+    src.cwise().abs().maxCoeff(&maxi);
+    if (maxi==0)
+      sndi = 1;
+    RealScalar invnm = RealScalar(1)/(Vector2() << src.coeff(sndi),src.coeff(maxi)).finished().norm();
+    perp.coeffRef(maxi) = -ei_conj(src.coeff(sndi)) * invnm;
+    perp.coeffRef(sndi) =  ei_conj(src.coeff(maxi)) * invnm;
+
+    return perp;
+   }
+};
+
+template<typename Derived>
+struct ei_unitOrthogonal_selector<Derived,3>
+{
+  typedef typename ei_plain_matrix_type<Derived>::type VectorType;
+  typedef typename ei_traits<Derived>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  inline static VectorType run(const Derived& src)
+  {
+    VectorType perp;
    /* Let us compute the crossed product of *this with a vector
     * that is not too close to being colinear to *this.
     */
@@ -85,9 +188,6 @@ struct ei_unitOrthogonal_selector
      perp.coeffRef(1) = -ei_conj(src.z())*invnm;
      perp.coeffRef(2) = ei_conj(src.y())*invnm;
    }
-    if( (Derived::SizeAtCompileTime!=Dynamic && Derived::SizeAtCompileTime>3)
-     || (Derived::SizeAtCompileTime==Dynamic && src.size()>3) )
-      perp.end(src.size()-3).setZero();

    return perp;
   }
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -62,6 +62,8 @@ class Quaternion : public RotationBase<Quaternion<_Scalar>,3>
 {
  typedef RotationBase<Quaternion<_Scalar>,3> Base;

+
+
 public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,4)

@@ -189,9 +191,6 @@ public:

  Quaternion slerp(Scalar t, const Quaternion& other) const;

-  template<typename Derived>
-  Vector3 operator* (const MatrixBase<Derived>& vec) const;
-
  /** \returns \c *this with scalar type casted to \a NewScalarType
    *
    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
@@ -213,6 +212,8 @@ public:
  bool isApprox(const Quaternion& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
  { return m_coeffs.isApprox(other.m_coeffs, prec); }

+  Vector3 _transformVector(Vector3 v) const;
+
 protected:
  Coefficients m_coeffs;
 };
@@ -224,17 +225,24 @@ typedef Quaternion<float> Quaternionf;
  * double precision quaternion type */
 typedef Quaternion<double> Quaterniond;

+// Generic Quaternion * Quaternion product
+template<int Arch,typename Scalar> inline Quaternion<Scalar>
+ei_quaternion_product(const Quaternion<Scalar>& a, const Quaternion<Scalar>& b)
+{
+  return Quaternion<Scalar>
+  (
+    a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
+    a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
+    a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
+    a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
+  );
+}
+
 /** \returns the concatenation of two rotations as a quaternion-quaternion product */
 template <typename Scalar>
 inline Quaternion<Scalar> Quaternion<Scalar>::operator* (const Quaternion& other) const
 {
-  return Quaternion
-  (
-    this->w() * other.w() - this->x() * other.x() - this->y() * other.y() - this->z() * other.z(),
-    this->w() * other.x() + this->x() * other.w() + this->y() * other.z() - this->z() * other.y(),
-    this->w() * other.y() + this->y() * other.w() + this->z() * other.x() - this->x() * other.z(),
-    this->w() * other.z() + this->z() * other.w() + this->x() * other.y() - this->y() * other.x()
-  );
+  return ei_quaternion_product<EiArch>(*this,other);
 }

 /** \sa operator*(Quaternion) */
@@ -252,17 +260,15 @@ inline Quaternion<Scalar>& Quaternion<Scalar>::operator*= (const Quaternion& oth
  *   - Via a Matrix3: 24 + 15n
  */
 template <typename Scalar>
-template<typename Derived>
 inline typename Quaternion<Scalar>::Vector3
-Quaternion<Scalar>::operator* (const MatrixBase<Derived>& v) const
+Quaternion<Scalar>::_transformVector(Vector3 v) const
 {
    // Note that this algorithm comes from the optimization by hand
    // of the conversion to a Matrix followed by a Matrix/Vector product.
    // It appears to be much faster than the common algorithm found
    // in the litterature (30 versus 39 flops). It also requires two
    // Vector3 as temporaries.
-    Vector3 uv;
-    uv = 2 * this->vec().cross(v);
+    Vector3 uv = Scalar(2) * this->vec().cross(v);
    return v + this->w() * uv + this->vec().cross(uv);
 }

@@ -346,7 +352,6 @@ inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBas
 {
  Vector3 v0 = a.normalized();
  Vector3 v1 = b.normalized();
-  Vector3 axis = v0.cross(v1);
  Scalar c = v0.dot(v1);

  // if dot == 1, vectors are the same
@@ -354,7 +359,17 @@ inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBas
  {
    // set to identity
    this->w() = 1; this->vec().setZero();
+    return *this;
  }
+  // if dot == -1, vectors are opposites
+  if (ei_isApprox(c,Scalar(-1)))
+  {
+    this->vec() = v0.unitOrthogonal();
+    this->w() = 0;
+    return *this;
+  }
+
+  Vector3 axis = v0.cross(v1);
  Scalar s = ei_sqrt((Scalar(1)+c)*Scalar(2));
  Scalar invs = Scalar(1)/s;
  this->vec() = axis * invs;
--- a/Eigen/src/Geometry/RotationBase.h
+++ b/Eigen/src/Geometry/RotationBase.h
@@ -42,10 +42,31 @@ class RotationBase
    enum { Dim = _Dim };
    /** the scalar type of the coefficients */
    typedef typename ei_traits<Derived>::Scalar Scalar;
-    
+
    /** corresponding linear transformation matrix type */
    typedef Matrix<Scalar,Dim,Dim> RotationMatrixType;
+    typedef Matrix<Scalar,Dim,1> VectorType;

+  protected:
+    template<typename MatrixType, bool IsVector=MatrixType::IsVectorAtCompileTime>
+    struct generic_product_selector
+    {
+      typedef RotationMatrixType ReturnType;
+      inline static RotationMatrixType run(const Derived& r, const MatrixType& m)
+      { return r.toRotationMatrix() * m; }
+    };
+
+    template<typename OtherVectorType>
+    struct generic_product_selector<OtherVectorType,true>
+    {
+      typedef VectorType ReturnType;
+      inline static VectorType run(const Derived& r, const OtherVectorType& v)
+      {
+        return r._transformVector(v);
+      }
+    };
+
+  public:
    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
    inline Derived& derived() { return *static_cast<Derived*>(this); }

@@ -59,13 +80,33 @@ class RotationBase
    inline Transform<Scalar,Dim> operator*(const Translation<Scalar,Dim>& t) const
    { return toRotationMatrix() * t; }

-    /** \returns the concatenation of the rotation \c *this with a scaling \a s */
-    inline RotationMatrixType operator*(const Scaling<Scalar,Dim>& s) const
-    { return toRotationMatrix() * s; }
+    /** \returns the concatenation of the rotation \c *this with a uniform scaling \a s */
+    inline RotationMatrixType operator*(const UniformScaling<Scalar>& s) const
+    { return toRotationMatrix() * s.factor(); }

-    /** \returns the concatenation of the rotation \c *this with an affine transformation \a t */
-    inline Transform<Scalar,Dim> operator*(const Transform<Scalar,Dim>& t) const
+    /** \returns the concatenation of the rotation \c *this with a generic expression \a e
+      * \a e can be:
+      *  - a DimxDim linear transformation matrix (including an axis aligned scaling)
+      *  - a vector of size Dim
+      */
+    template<typename OtherDerived>
+    inline typename generic_product_selector<OtherDerived,OtherDerived::IsVectorAtCompileTime>::ReturnType
+    operator*(const MatrixBase<OtherDerived>& e) const
+    { return generic_product_selector<OtherDerived>::run(derived(), e.derived()); }
+
+    /** \returns the concatenation of a linear transformation \a l with the rotation \a r */
+    template<typename OtherDerived> friend
+    inline RotationMatrixType operator*(const MatrixBase<OtherDerived>& l, const Derived& r)
+    { return l.derived() * r.toRotationMatrix(); }
+
+    /** \returns the concatenation of the rotation \c *this with a transformation \a t */
+    template<int Mode>
+    inline Transform<Scalar,Dim,Mode> operator*(const Transform<Scalar,Dim,Mode>& t) const
    { return toRotationMatrix() * t; }
+
+    template<typename OtherVectorType>
+    inline VectorType _transformVector(const OtherVectorType& v) const
+    { return toRotationMatrix() * v; }
 };

 /** \geometry_module
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h
@@ -29,102 +29,65 @@
  *
  * \class Scaling
  *
-  * \brief Represents a possibly non uniform scaling transformation
+  * \brief Represents a generic uniform scaling transformation
  *
  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
  *
-  * \note This class is not aimed to be used to store a scaling transformation,
+  * This class represent a uniform scaling transformation. It is the return
+  * type of Scaling(Scalar), and most of the time this is the only way it
+  * is used. In particular, this class is not aimed to be used to store a scaling transformation,
  * but rather to make easier the constructions and updates of Transform objects.
  *
-  * \sa class Translation, class Transform
+  * To represent an axis aligned scaling, use the DiagonalMatrix class.
+  *
+  * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform
  */
-template<typename _Scalar, int _Dim>
-class Scaling
+template<typename _Scalar>
+class UniformScaling
 {
 public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
-  /** dimension of the space */
-  enum { Dim = _Dim };
  /** the scalar type of the coefficients */
  typedef _Scalar Scalar;
-  /** corresponding vector type */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** corresponding linear transformation matrix type */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** corresponding translation type */
-  typedef Translation<Scalar,Dim> TranslationType;
-  /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim> TransformType;

 protected:

-  VectorType m_coeffs;
+  Scalar m_factor;

 public:

  /** Default constructor without initialization. */
-  Scaling() {}
+  UniformScaling() {}
  /** Constructs and initialize a uniform scaling transformation */
-  explicit inline Scaling(const Scalar& s) { m_coeffs.setConstant(s); }
-  /** 2D only */
-  inline Scaling(const Scalar& sx, const Scalar& sy)
-  {
-    ei_assert(Dim==2);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-  }
-  /** 3D only */
-  inline Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-  {
-    ei_assert(Dim==3);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-    m_coeffs.z() = sz;
-  }
-  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
-  explicit inline Scaling(const VectorType& coeffs) : m_coeffs(coeffs) {}
+  explicit inline UniformScaling(const Scalar& s) : m_factor(s) {}

-  const VectorType& coeffs() const { return m_coeffs; }
-  VectorType& coeffs() { return m_coeffs; }
+  inline const Scalar& factor() const { return m_factor; }
+  inline Scalar& factor() { return m_factor; }

-  /** Concatenates two scaling */
-  inline Scaling operator* (const Scaling& other) const
-  { return Scaling(coeffs().cwise() * other.coeffs()); }
+  /** Concatenates two uniform scaling */
+  inline UniformScaling operator* (const UniformScaling& other) const
+  { return UniformScaling(m_factor * other.factor()); }

-  /** Concatenates a scaling and a translation */
-  inline TransformType operator* (const TranslationType& t) const;
+  /** Concatenates a uniform scaling and a translation */
+  template<int Dim>
+  inline Transform<Scalar,Dim> operator* (const Translation<Scalar,Dim>& t) const;

-  /** Concatenates a scaling and an affine transformation */
-  inline TransformType operator* (const TransformType& t) const;
+  /** Concatenates a uniform scaling and an affine transformation */
+  template<int Dim>
+  inline Transform<Scalar,Dim> operator* (const Transform<Scalar,Dim>& t) const;

-  /** Concatenates a scaling and a linear transformation matrix */
+  /** Concatenates a uniform scaling and a linear transformation matrix */
  // TODO returns an expression
-  inline LinearMatrixType operator* (const LinearMatrixType& other) const
-  { return coeffs().asDiagonal() * other; }
-
-  /** Concatenates a linear transformation matrix and a scaling */
-  // TODO returns an expression
-  friend inline LinearMatrixType operator* (const LinearMatrixType& other, const Scaling& s)
-  { return other * s.coeffs().asDiagonal(); }
-
  template<typename Derived>
-  inline LinearMatrixType operator*(const RotationBase<Derived,Dim>& r) const
-  { return *this * r.toRotationMatrix(); }
+  inline typename ei_plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
+  { return other * m_factor; }

-  /** Applies scaling to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return coeffs().asDiagonal() * other; }
+  template<typename Derived,int Dim>
+  inline Matrix<Scalar,Dim,Dim> operator*(const RotationBase<Derived,Dim>& r) const
+  { return r.toRotationMatrix() * m_factor; }

  /** \returns the inverse scaling */
-  inline Scaling inverse() const
-  { return Scaling(coeffs().cwise().inverse()); }
-
-  inline Scaling& operator=(const Scaling& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
+  inline UniformScaling inverse() const
+  { return UniformScaling(Scalar(1)/m_factor); }

  /** \returns \c *this with scalar type casted to \a NewScalarType
    *
@@ -132,50 +95,65 @@ public:
    * then this function smartly returns a const reference to \c *this.
    */
  template<typename NewScalarType>
-  inline typename ei_cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type cast() const
-  { return typename ei_cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type(*this); }
+  inline UniformScaling<NewScalarType> cast() const
+  { return UniformScaling<NewScalarType>(NewScalarType(m_factor)); }

  /** Copy constructor with scalar type conversion */
  template<typename OtherScalarType>
-  inline explicit Scaling(const Scaling<OtherScalarType,Dim>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+  inline explicit UniformScaling(const UniformScaling<OtherScalarType>& other)
+  { m_factor = Scalar(other.factor()); }

  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
    * determined by \a prec.
    *
    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Scaling& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
+  bool isApprox(const UniformScaling& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
+  { return ei_isApprox(m_factor, other.factor(), prec); }

 };

+/** Concatenates a linear transformation matrix and a uniform scaling */
+// NOTE this operator is defiend in MatrixBase and not as a friend function
+// of UniformScaling to fix an internal crash of Intel's ICC
+template<typename Derived> const typename MatrixBase<Derived>::ScalarMultipleReturnType
+MatrixBase<Derived>::operator*(const UniformScaling<Scalar>& s) const
+{ return derived() * s.factor(); }
+
+/** Constructs a uniform scaling from scale factor \a s */
+static inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
+/** Constructs a uniform scaling from scale factor \a s */
+static inline UniformScaling<double> Scaling(double s) { return UniformScaling<double>(s); }
+/** Constructs a uniform scaling from scale factor \a s */
+template<typename RealScalar>
+static inline UniformScaling<std::complex<RealScalar> > Scaling(const std::complex<RealScalar>& s)
+{ return UniformScaling<std::complex<RealScalar> >(s); }
+
+/** Constructs a 2D axis aligned scaling */
+template<typename Scalar>
+static inline DiagonalMatrix<Scalar,2> Scaling(Scalar sx, Scalar sy)
+{ return DiagonalMatrix<Scalar,2>(sx, sy); }
+/** Constructs a 3D axis aligned scaling */
+template<typename Scalar>
+static inline DiagonalMatrix<Scalar,3> Scaling(Scalar sx, Scalar sy, Scalar sz)
+{ return DiagonalMatrix<Scalar,3>(sx, sy, sz); }
+
+/** Constructs an axis aligned scaling expression from vector expression \a coeffs
+  * This is an alias for coeffs.asDiagonal()
+  */
+template<typename Derived>
+static inline const DiagonalMatrixWrapper<Derived> Scaling(const MatrixBase<Derived>& coeffs)
+{ return coeffs.asDiagonal(); }
+
 /** \addtogroup Geometry_Module */
 //@{
-typedef Scaling<float, 2> Scaling2f;
-typedef Scaling<double,2> Scaling2d;
-typedef Scaling<float, 3> Scaling3f;
-typedef Scaling<double,3> Scaling3d;
+/** \deprecated */
+typedef DiagonalMatrix<float, 2> AlignedScaling2f;
+/** \deprecated */
+typedef DiagonalMatrix<double,2> AlignedScaling2d;
+/** \deprecated */
+typedef DiagonalMatrix<float, 3> AlignedScaling3f;
+/** \deprecated */
+typedef DiagonalMatrix<double,3> AlignedScaling3d;
 //@}

-template<typename Scalar, int Dim>
-inline typename Scaling<Scalar,Dim>::TransformType
-Scaling<Scalar,Dim>::operator* (const TranslationType& t) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear().diagonal() = coeffs();
-  res.translation() = m_coeffs.cwise() * t.vector();
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Scaling<Scalar,Dim>::TransformType
-Scaling<Scalar,Dim>::operator* (const TransformType& t) const
-{
-  TransformType res = t;
-  res.prescale(m_coeffs);
-  return res;
-}
-
 #endif // EIGEN_SCALING_H
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -52,10 +52,8 @@ public:
  typedef Matrix<Scalar,Dim,1> VectorType;
  /** corresponding linear transformation matrix type */
  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** corresponding scaling transformation type */
-  typedef Scaling<Scalar,Dim> ScalingType;
  /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim> TransformType;
+  typedef Transform<Scalar,Dim> AffineTransformType;

 protected:

@@ -80,7 +78,7 @@ public:
    m_coeffs.y() = sy;
    m_coeffs.z() = sz;
  }
-  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
+  /** Constructs and initialize the translation transformation from a vector of translation coefficients */
  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}

  const VectorType& vector() const { return m_coeffs; }
@@ -90,31 +88,40 @@ public:
  inline Translation operator* (const Translation& other) const
  { return Translation(m_coeffs + other.m_coeffs); }

-  /** Concatenates a translation and a scaling */
-  inline TransformType operator* (const ScalingType& other) const;
+  /** Concatenates a translation and a uniform scaling */
+  inline AffineTransformType operator* (const UniformScaling<Scalar>& other) const;

  /** Concatenates a translation and a linear transformation */
-  inline TransformType operator* (const LinearMatrixType& linear) const;
+  template<typename OtherDerived>
+  inline AffineTransformType operator* (const MatrixBase<OtherDerived>& linear) const;

+  /** Concatenates a translation and a rotation */
  template<typename Derived>
-  inline TransformType operator*(const RotationBase<Derived,Dim>& r) const
+  inline AffineTransformType operator*(const RotationBase<Derived,Dim>& r) const
  { return *this * r.toRotationMatrix(); }

-  /** Concatenates a linear transformation and a translation */
+  /** \returns the concatenation of a linear transformation \a l with the translation \a t */
  // its a nightmare to define a templated friend function outside its declaration
-  friend inline TransformType operator* (const LinearMatrixType& linear, const Translation& t)
+  template<typename OtherDerived> friend
+  inline AffineTransformType operator*(const MatrixBase<OtherDerived>& linear, const Translation& t)
  {
-    TransformType res;
+    AffineTransformType res;
    res.matrix().setZero();
-    res.linear() = linear;
-    res.translation() = linear * t.m_coeffs;
+    res.linear() = linear.derived();
+    res.translation() = linear.derived() * t.m_coeffs;
    res.matrix().row(Dim).setZero();
    res(Dim,Dim) = Scalar(1);
    return res;
  }

  /** Concatenates a translation and an affine transformation */
-  inline TransformType operator* (const TransformType& t) const;
+  template<int Mode>
+  inline Transform<Scalar,Dim,Mode> operator* (const Transform<Scalar,Dim,Mode>& t) const
+  {
+    Transform<Scalar,Dim,Mode> res = t;
+    res.pretranslate(m_coeffs);
+    return res;
+  }

  /** Applies translation to vector */
  inline VectorType operator* (const VectorType& other) const
@@ -160,39 +167,30 @@ typedef Translation<float, 3> Translation3f;
 typedef Translation<double,3> Translation3d;
 //@}

-
 template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const ScalingType& other) const
+inline typename Translation<Scalar,Dim>::AffineTransformType
+Translation<Scalar,Dim>::operator* (const UniformScaling<Scalar>& other) const
 {
-  TransformType res;
+  AffineTransformType res;
  res.matrix().setZero();
-  res.linear().diagonal() = other.coeffs();
+  res.linear().diagonal().fill(other.factor());
  res.translation() = m_coeffs;
  res(Dim,Dim) = Scalar(1);
  return res;
 }

 template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const LinearMatrixType& linear) const
+template<typename OtherDerived>
+inline typename Translation<Scalar,Dim>::AffineTransformType
+Translation<Scalar,Dim>::operator* (const MatrixBase<OtherDerived>& linear) const
 {
-  TransformType res;
+  AffineTransformType res;
  res.matrix().setZero();
-  res.linear() = linear;
+  res.linear() = linear.derived();
  res.translation() = m_coeffs;
  res.matrix().row(Dim).setZero();
  res(Dim,Dim) = Scalar(1);
  return res;
 }

-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const TransformType& t) const
-{
-  TransformType res = t;
-  res.pretranslate(m_coeffs);
-  return res;
-}
-
 #endif // EIGEN_TRANSLATION_H
--- a/Eigen/src/Geometry/arch/CMakeLists.txt
+++ b/Eigen/src/Geometry/arch/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Geometry_arch_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_Geometry_arch_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Geometry/arch COMPONENT Devel
+  )
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -0,0 +1,63 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_GEOMETRY_SSE_H
+#define EIGEN_GEOMETRY_SSE_H
+
+template<> inline Quaternion<float>
+ei_quaternion_product<EiArch_SSE,float>(const Quaternion<float>& _a, const Quaternion<float>& _b)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
+  Quaternion<float> res;
+  __m128 a = _a.coeffs().packet<Aligned>(0);
+  __m128 b = _b.coeffs().packet<Aligned>(0);
+  __m128 flip1 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,1,2,0,2),
+                                       ei_vec4f_swizzle1(b,2,0,1,2)),mask);
+  __m128 flip2 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,3,3,3,1),
+                                       ei_vec4f_swizzle1(b,0,1,2,1)),mask);
+  ei_pstore(&res.x(),
+            _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,ei_vec4f_swizzle1(b,3,3,3,3)),
+                                  _mm_mul_ps(ei_vec4f_swizzle1(a,2,0,1,0),
+                                             ei_vec4f_swizzle1(b,1,2,0,0))),
+                       _mm_add_ps(flip1,flip2)));
+  return res;
+}
+
+template<typename VectorLhs,typename VectorRhs>
+struct ei_cross3_impl<EiArch_SSE,VectorLhs,VectorRhs,float,true> {
+  inline static typename ei_plain_matrix_type<VectorLhs>::type
+  run(const VectorLhs& lhs, const VectorRhs& rhs)
+  {
+    __m128 a = lhs.coeffs().packet<VectorLhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
+    __m128 b = rhs.coeffs().packet<VectorRhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
+    __m128 mul1=_mm_mul_ps(ei_vec4f_swizzle1(a,1,2,0,3),ei_vec4f_swizzle1(b,2,0,1,3));
+    __m128 mul2=_mm_mul_ps(ei_vec4f_swizzle1(a,2,0,1,3),ei_vec4f_swizzle1(b,1,2,0,3));
+    typename ei_plain_matrix_type<VectorLhs>::type res;
+    ei_pstore(&res.x(),_mm_sub_ps(mul1,mul2));
+    return res;
+  }
+};
+
+#endif // EIGEN_GEOMETRY_SSE_H
--- a/Eigen/src/LU/CMakeLists.txt
+++ b/Eigen/src/LU/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_LU_SRCS "*.h")

 INSTALL(FILES 
  ${Eigen_LU_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU COMPONENT Devel
  )
--- a/Eigen/src/LU/Determinant.h
+++ b/Eigen/src/LU/Determinant.h
@@ -51,7 +51,7 @@ template<typename Derived,
 {
  static inline typename ei_traits<Derived>::Scalar run(const Derived& m)
  {
-    return m.lu().determinant();
+    return m.partialLu().determinant();
  }
 };

--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/Inverse.h
@@ -170,8 +170,7 @@ struct ei_compute_inverse
 {
  static inline void run(const MatrixType& matrix, MatrixType* result)
  {
-    LU<MatrixType> lu(matrix);
-    lu.computeInverse(result);
+    matrix.partialLu().computeInverse(result);
  }
 };

--- a/Eigen/src/LU/LU.h
+++ b/Eigen/src/LU/LU.h
@@ -323,6 +323,7 @@ template<typename MatrixType> class LU
    IntRowVectorType m_q;
    int m_det_pq;
    int m_rank;
+    RealScalar m_precision;
 };

 template<typename MatrixType>
@@ -332,9 +333,13 @@ LU<MatrixType>::LU(const MatrixType& matrix)
    m_p(matrix.rows()),
    m_q(matrix.cols())
 {
-  const int size = matrix.diagonal().size();
+  const int size = matrix.diagonalSize();
  const int rows = matrix.rows();
  const int cols = matrix.cols();
+  
+  // this formula comes from experimenting (see "LU precision tuning" thread on the list)
+  // and turns out to be identical to Higham's formula used already in LDLt.
+  m_precision = machine_epsilon<Scalar>() * size;

  IntColVectorType rows_transpositions(matrix.rows());
  IntRowVectorType cols_transpositions(matrix.cols());
@@ -355,7 +360,7 @@ LU<MatrixType>::LU(const MatrixType& matrix)
    if(k==0) biggest = biggest_in_corner;

    // if the corner is negligible, then we have less than full rank, and we can finish early
-    if(ei_isMuchSmallerThan(biggest_in_corner, biggest))
+    if(ei_isMuchSmallerThan(biggest_in_corner, biggest, m_precision))
    {
      m_rank = k;
      for(int i = k; i < size; i++)
@@ -397,7 +402,7 @@ LU<MatrixType>::LU(const MatrixType& matrix)
 template<typename MatrixType>
 typename ei_traits<MatrixType>::Scalar LU<MatrixType>::determinant() const
 {
-  return Scalar(m_det_pq) * m_lu.diagonal().redux(ei_scalar_product_op<Scalar>());
+  return Scalar(m_det_pq) * m_lu.diagonal().prod();
 }

 template<typename MatrixType>
@@ -506,7 +511,7 @@ bool LU<MatrixType>::solve(
    RealScalar biggest_in_c = c.corner(TopLeft, m_rank, c.cols()).cwise().abs().maxCoeff();
    for(int col = 0; col < c.cols(); ++col)
      for(int row = m_rank; row < c.rows(); ++row)
-        if(!ei_isMuchSmallerThan(c.coeff(row,col), biggest_in_c))
+        if(!ei_isMuchSmallerThan(c.coeff(row,col), biggest_in_c, m_precision))
          return false;
  }
  m_lu.corner(TopLeft, m_rank, m_rank)
--- a/Eigen/src/LU/PartialLU.h
+++ b/Eigen/src/LU/PartialLU.h
@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_PARTIALLU_H
+#define EIGEN_PARTIALLU_H
+
+/** \ingroup LU_Module
+  *
+  * \class PartialLU
+  *
+  * \brief LU decomposition of a matrix with partial pivoting, and related features
+  *
+  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
+  *
+  * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A
+  * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P
+  * is a permutation matrix.
+  *
+  * Typically, partial pivoting LU decomposition is only considered numerically stable for square invertible matrices.
+  * So in this class, we plainly require that and take advantage of that to do some simplifications and optimizations.
+  * This class will assert that the matrix is square, but it won't (actually it can't) check that the matrix is invertible:
+  * it is your task to check that you only use this decomposition on invertible matrices.
+  *
+  * The guaranteed safe alternative, working for all matrices, is the full pivoting LU decomposition, provided by class LU.
+  *
+  * This is \b not a rank-revealing LU decomposition. Many features are intentionally absent from this class,
+  * such as rank computation. If you need these features, use class LU.
+  *
+  * This LU decomposition is suitable to invert invertible matrices. It is what MatrixBase::inverse() uses. On the other hand,
+  * it is \b not suitable to determine whether a given matrix is invertible.
+  *
+  * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
+  *
+  * \sa MatrixBase::partialLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class LU
+  */
+template<typename MatrixType> class PartialLU
+{
+  public:
+
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
+    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVectorType;
+
+    enum { MaxSmallDimAtCompileTime = EIGEN_ENUM_MIN(
+             MatrixType::MaxColsAtCompileTime,
+             MatrixType::MaxRowsAtCompileTime)
+    };
+
+    /** Constructor.
+      *
+      * \param matrix the matrix of which to compute the LU decomposition.
+      *
+      * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
+      * If you need to deal with non-full rank, use class LU instead.
+      */
+    PartialLU(const MatrixType& matrix);
+
+    /** \returns the LU decomposition matrix: the upper-triangular part is U, the
+      * unit-lower-triangular part is L (at least for square matrices; in the non-square
+      * case, special care is needed, see the documentation of class LU).
+      *
+      * \sa matrixL(), matrixU()
+      */
+    inline const MatrixType& matrixLU() const
+    {
+      return m_lu;
+    }
+
+    /** \returns a vector of integers, whose size is the number of rows of the matrix being decomposed,
+      * representing the P permutation i.e. the permutation of the rows. For its precise meaning,
+      * see the examples given in the documentation of class LU.
+      */
+    inline const IntColVectorType& permutationP() const
+    {
+      return m_p;
+    }
+
+    /** This method finds the solution x to the equation Ax=b, where A is the matrix of which
+      * *this is the LU decomposition. Since if this partial pivoting decomposition the matrix is assumed
+      * to have full rank, such a solution is assumed to exist and to be unique.
+      *
+      * \warning Again, if your matrix may not have full rank, use class LU instead. See LU::solve().
+      *
+      * \param b the right-hand-side of the equation to solve. Can be a vector or a matrix,
+      *          the only requirement in order for the equation to make sense is that
+      *          b.rows()==A.rows(), where A is the matrix of which *this is the LU decomposition.
+      * \param result a pointer to the vector or matrix in which to store the solution, if any exists.
+      *          Resized if necessary, so that result->rows()==A.cols() and result->cols()==b.cols().
+      *          If no solution exists, *result is left with undefined coefficients.
+      *
+      * Example: \include PartialLU_solve.cpp
+      * Output: \verbinclude PartialLU_solve.out
+      *
+      * \sa MatrixBase::solveTriangular(), inverse(), computeInverse()
+      */
+    template<typename OtherDerived, typename ResultType>
+    void solve(const MatrixBase<OtherDerived>& b, ResultType *result) const;
+
+    /** \returns the determinant of the matrix of which
+      * *this is the LU decomposition. It has only linear complexity
+      * (that is, O(n) where n is the dimension of the square matrix)
+      * as the LU decomposition has already been computed.
+      *
+      * \note For fixed-size matrices of size up to 4, MatrixBase::determinant() offers
+      *       optimized paths.
+      *
+      * \warning a determinant can be very big or small, so for matrices
+      * of large enough dimension, there is a risk of overflow/underflow.
+      *
+      * \sa MatrixBase::determinant()
+      */
+    typename ei_traits<MatrixType>::Scalar determinant() const;
+
+    /** Computes the inverse of the matrix of which *this is the LU decomposition.
+      *
+      * \param result a pointer to the matrix into which to store the inverse. Resized if needed.
+      *
+      * \warning The matrix being decomposed here is assumed to be invertible. If you need to check for
+      *          invertibility, use class LU instead.
+      *
+      * \sa MatrixBase::computeInverse(), inverse()
+      */
+    inline void computeInverse(MatrixType *result) const
+    {
+      solve(MatrixType::Identity(m_lu.rows(), m_lu.cols()), result);
+    }
+
+    /** \returns the inverse of the matrix of which *this is the LU decomposition.
+      *
+      * \warning The matrix being decomposed here is assumed to be invertible. If you need to check for
+      *          invertibility, use class LU instead.
+      *
+      * \sa computeInverse(), MatrixBase::inverse()
+      */
+    inline MatrixType inverse() const
+    {
+      MatrixType result;
+      computeInverse(&result);
+      return result;
+    }
+
+  protected:
+    const MatrixType& m_originalMatrix;
+    MatrixType m_lu;
+    IntColVectorType m_p;
+    int m_det_p;
+};
+
+template<typename MatrixType>
+PartialLU<MatrixType>::PartialLU(const MatrixType& matrix)
+  : m_originalMatrix(matrix),
+    m_lu(matrix),
+    m_p(matrix.rows())
+{
+  ei_assert(matrix.rows() == matrix.cols() && "PartialLU is only for square (and moreover invertible) matrices");
+  const int size = matrix.rows();
+
+  IntColVectorType rows_transpositions(size);
+  int number_of_transpositions = 0;
+
+  for(int k = 0; k < size; ++k)
+  {
+    int row_of_biggest_in_col;
+    m_lu.block(k,k,size-k,1).cwise().abs().maxCoeff(&row_of_biggest_in_col);
+    row_of_biggest_in_col += k;
+
+    rows_transpositions.coeffRef(k) = row_of_biggest_in_col;
+
+    if(k != row_of_biggest_in_col) {
+      m_lu.row(k).swap(m_lu.row(row_of_biggest_in_col));
+      ++number_of_transpositions;
+    }
+
+    if(k<size-1) {
+      m_lu.col(k).end(size-k-1) /= m_lu.coeff(k,k);
+      for(int col = k + 1; col < size; ++col)
+        m_lu.col(col).end(size-k-1) -= m_lu.col(k).end(size-k-1) * m_lu.coeff(k,col); 
+    }
+  }
+
+  for(int k = 0; k < size; ++k) m_p.coeffRef(k) = k;
+  for(int k = size-1; k >= 0; --k)
+    std::swap(m_p.coeffRef(k), m_p.coeffRef(rows_transpositions.coeff(k)));
+
+  m_det_p = (number_of_transpositions%2) ? -1 : 1;
+}
+
+template<typename MatrixType>
+typename ei_traits<MatrixType>::Scalar PartialLU<MatrixType>::determinant() const
+{
+  return Scalar(m_det_p) * m_lu.diagonal().prod();
+}
+
+template<typename MatrixType>
+template<typename OtherDerived, typename ResultType>
+void PartialLU<MatrixType>::solve(
+  const MatrixBase<OtherDerived>& b,
+  ResultType *result
+) const
+{
+  /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+   * So we proceed as follows:
+   * Step 1: compute c = Pb.
+   * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
+   * Step 3: replace c by the solution x to Ux = c. Check if a solution really exists.
+   */
+
+  const int size = m_lu.rows();
+  ei_assert(b.rows() == size);
+
+  result->resize(size, b.cols());
+
+  // Step 1
+  for(int i = 0; i < size; ++i) result->row(m_p.coeff(i)) = b.row(i);
+
+  // Step 2
+  m_lu.template marked<UnitLowerTriangular>()
+      .solveTriangularInPlace(*result);
+
+  // Step 3
+  m_lu.template marked<UpperTriangular>()
+      .solveTriangularInPlace(*result);
+}
+
+/** \lu_module
+  *
+  * \return the LU decomposition of \c *this.
+  *
+  * \sa class LU
+  */
+template<typename Derived>
+inline const PartialLU<typename MatrixBase<Derived>::PlainMatrixType>
+MatrixBase<Derived>::partialLu() const
+{
+  return PartialLU<PlainMatrixType>(eval());
+}
+
+#endif // EIGEN_PARTIALLU_H
--- a/Eigen/src/LeastSquares/CMakeLists.txt
+++ b/Eigen/src/LeastSquares/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_LeastSquares_SRCS "*.h")

 INSTALL(FILES 
  ${Eigen_LeastSquares_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LeastSquares
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LeastSquares COMPONENT Devel
  )
--- a/Eigen/src/LeastSquares/LeastSquares.h
+++ b/Eigen/src/LeastSquares/LeastSquares.h
@@ -160,7 +160,6 @@ void fitHyperplane(int numPoints,

  // compute the covariance matrix
  CovMatrixType covMat = CovMatrixType::Zero(size, size);
-  VectorType remean = VectorType::Zero(size);
  for(int i = 0; i < numPoints; ++i)
  {
    VectorType diff = (*(points[i]) - mean).conjugate();
--- a/Eigen/src/QR/CMakeLists.txt
+++ b/Eigen/src/QR/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_QR_SRCS "*.h")

 INSTALL(FILES
  ${Eigen_QR_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/QR
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/QR COMPONENT Devel
  )
--- a/Eigen/src/QR/HessenbergDecomposition.h
+++ b/Eigen/src/QR/HessenbergDecomposition.h
@@ -58,10 +58,10 @@ template<typename _MatrixType> class HessenbergDecomposition
    typedef Matrix<RealScalar, Size, 1> DiagonalType;
    typedef Matrix<RealScalar, SizeMinusOne, 1> SubDiagonalType;

-    typedef typename NestByValue<DiagonalCoeffs<MatrixType> >::RealReturnType DiagonalReturnType;
+    typedef typename NestByValue<Diagonal<MatrixType,0> >::RealReturnType DiagonalReturnType;

-    typedef typename NestByValue<DiagonalCoeffs<
-        NestByValue<Block<MatrixType,SizeMinusOne,SizeMinusOne> > > >::RealReturnType SubDiagonalReturnType;
+    typedef typename NestByValue<Diagonal<
+        NestByValue<Block<MatrixType,SizeMinusOne,SizeMinusOne> >,0 > >::RealReturnType SubDiagonalReturnType;

    /** This constructor initializes a HessenbergDecomposition object for
      * further use with HessenbergDecomposition::compute()
--- a/Eigen/src/QR/QR.h
+++ b/Eigen/src/QR/QR.h
@@ -55,12 +55,64 @@ template<typename MatrixType> class QR
    {
      _compute(matrix);
    }
-
-    /** \returns whether or not the matrix is of full rank */
-    bool isFullRank() const { return rank() == std::min(m_qr.rows(),m_qr.cols()); }
    
+    /** \deprecated use isInjective()
+      * \returns whether or not the matrix is of full rank
+      *
+      * \note Since the rank is computed only once, i.e. the first time it is needed, this
+      *       method almost does not perform any further computation.
+      */
+    EIGEN_DEPRECATED bool isFullRank() const { return rank() == m_qr.cols(); }
+    
+    /** \returns the rank of the matrix of which *this is the QR decomposition.
+      *
+      * \note Since the rank is computed only once, i.e. the first time it is needed, this
+      *       method almost does not perform any further computation.
+      */
    int rank() const;
+    
+    /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
+      *
+      * \note Since the rank is computed only once, i.e. the first time it is needed, this
+      *       method almost does not perform any further computation.
+      */
+    inline int dimensionOfKernel() const
+    {
+      return m_qr.cols() - rank();
+    }
+    
+    /** \returns true if the matrix of which *this is the QR decomposition represents an injective
+      *          linear map, i.e. has trivial kernel; false otherwise.
+      *
+      * \note Since the rank is computed only once, i.e. the first time it is needed, this
+      *       method almost does not perform any further computation.
+      */
+    inline bool isInjective() const
+    {
+      return rank() == m_qr.cols();
+    }
+    
+    /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
+      *          linear map; false otherwise.
+      *
+      * \note Since the rank is computed only once, i.e. the first time it is needed, this
+      *       method almost does not perform any further computation.
+      */
+    inline bool isSurjective() const
+    {
+      return rank() == m_qr.rows();
+    }

+    /** \returns true if the matrix of which *this is the QR decomposition is invertible.
+      *
+      * \note Since the rank is computed only once, i.e. the first time it is needed, this
+      *       method almost does not perform any further computation.
+      */
+    inline bool isInvertible() const
+    {
+      return isInjective() && isSurjective();
+    }
+    
    /** \returns a read-only expression of the matrix R of the actual the QR decomposition */
    const Part<NestByValue<MatrixRBlockType>, UpperTriangular>
    matrixR(void) const
@@ -69,6 +121,32 @@ template<typename MatrixType> class QR
      return MatrixRBlockType(m_qr, 0, 0, cols, cols).nestByValue().template part<UpperTriangular>();
    }

+    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
+      * *this is the QR decomposition, if any exists.
+      *
+      * \param b the right-hand-side of the equation to solve.
+      *
+      * \param result a pointer to the vector/matrix in which to store the solution, if any exists.
+      *          Resized if necessary, so that result->rows()==A.cols() and result->cols()==b.cols().
+      *          If no solution exists, *result is left with undefined coefficients.
+      *
+      * \returns true if any solution exists, false if no solution exists.
+      *
+      * \note If there exist more than one solution, this method will arbitrarily choose one.
+      *       If you need a complete analysis of the space of solutions, take the one solution obtained
+      *       by this method and add to it elements of the kernel, as determined by kernel().
+      *
+      * \note The case where b is a matrix is not yet implemented. Also, this
+      *       code is space inefficient.
+      *
+      * Example: \include QR_solve.cpp
+      * Output: \verbinclude QR_solve.out
+      *
+      * \sa MatrixBase::solveTriangular(), kernel(), computeKernel(), inverse(), computeInverse()
+      */
+    template<typename OtherDerived, typename ResultType>
+    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const;
+
    MatrixType matrixQ(void) const;

  private:
@@ -88,12 +166,11 @@ int QR<MatrixType>::rank() const
 {
  if (!m_rankIsUptodate)
  {
-    RealScalar maxCoeff = m_qr.diagonal().maxCoeff();
-    int n = std::min(m_qr.rows(),m_qr.cols());
-    m_rank = n;
-    for (int i=0; i<n; ++i)
-      if (ei_isMuchSmallerThan(m_qr.diagonal().coeff(i), maxCoeff))
-        --m_rank;
+    RealScalar maxCoeff = m_qr.diagonal().cwise().abs().maxCoeff();
+    int n = m_qr.cols();
+    m_rank = 0;
+    while(m_rank<n && !ei_isMuchSmallerThan(m_qr.diagonal().coeff(m_rank), maxCoeff))
+      ++m_rank;
    m_rankIsUptodate = true;
  }
  return m_rank;
@@ -108,6 +185,7 @@ void QR<MatrixType>::_compute(const MatrixType& matrix)
  m_qr = matrix;
  int rows = matrix.rows();
  int cols = matrix.cols();
+  RealScalar eps2 = precision<RealScalar>()*precision<RealScalar>();

  for (int k = 0; k < cols; ++k)
  {
@@ -132,7 +210,8 @@ void QR<MatrixType>::_compute(const MatrixType& matrix)
        m_hCoeffs.coeffRef(k) = 0;
      }
    }
-    else if ( (!ei_isMuchSmallerThan(beta=m_qr.col(k).end(remainingSize-1).squaredNorm(),static_cast<Scalar>(1))) || ei_imag(v0)==0 )
+    else if ((beta=m_qr.col(k).end(remainingSize-1).squaredNorm())>eps2)
+    // FIXME what about ei_imag(v0) ??
    {
      // form k-th Householder vector
      beta = ei_sqrt(ei_abs2(v0)+beta);
@@ -160,9 +239,40 @@ void QR<MatrixType>::_compute(const MatrixType& matrix)
  }
 }

+template<typename MatrixType>
+template<typename OtherDerived, typename ResultType>
+bool QR<MatrixType>::solve(
+  const MatrixBase<OtherDerived>& b,
+  ResultType *result
+) const
+{
+  const int rows = m_qr.rows();
+  ei_assert(b.rows() == rows);
+  result->resize(rows, b.cols());
+  
+  // TODO(keir): There is almost certainly a faster way to multiply by
+  // Q^T without explicitly forming matrixQ(). Investigate.
+  *result = matrixQ().transpose()*b;
+  
+  if(!isSurjective())
+  {
+    // is result is in the image of R ?
+    RealScalar biggest_in_res = result->corner(TopLeft, m_rank, result->cols()).cwise().abs().maxCoeff();
+    for(int col = 0; col < result->cols(); ++col)
+      for(int row = m_rank; row < result->rows(); ++row)
+        if(!ei_isMuchSmallerThan(result->coeff(row,col), biggest_in_res))
+          return false;
+  }
+  m_qr.corner(TopLeft, m_rank, m_rank)
+      .template marked<UpperTriangular>()
+      .solveTriangularInPlace(result->corner(TopLeft, m_rank, result->cols()));
+
+  return true;
+}
+
 /** \returns the matrix Q */
 template<typename MatrixType>
-MatrixType QR<MatrixType>::matrixQ(void) const
+MatrixType QR<MatrixType>::matrixQ() const
 {
  // compute the product Q_0 Q_1 ... Q_n-1,
  // where Q_k is the k-th Householder transformation I - h_k v_k v_k'
--- a/Eigen/src/QR/SelfAdjointEigenSolver.h
+++ b/Eigen/src/QR/SelfAdjointEigenSolver.h
@@ -52,8 +52,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
    typedef Tridiagonalization<MatrixType> TridiagonalizationType;

    SelfAdjointEigenSolver()
-        : m_eivec(Size, Size),
-          m_eivalues(Size)
+        : m_eivec(int(Size), int(Size)),
+          m_eivalues(int(Size))
    {
      ei_assert(Size!=Dynamic);
    }
@@ -189,6 +189,14 @@ void SelfAdjointEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool
  assert(matrix.cols() == matrix.rows());
  int n = matrix.cols();
  m_eivalues.resize(n,1);
+
+  if(n==1)
+  {
+    m_eivalues.coeffRef(0,0) = ei_real(matrix.coeff(0,0));
+    m_eivec.setOnes();
+    return;
+  }
+
  m_eivec = matrix;

  // FIXME, should tridiag be a local variable of this function or an attribute of SelfAdjointEigenSolver ?
--- a/Eigen/src/QR/Tridiagonalization.h
+++ b/Eigen/src/QR/Tridiagonalization.h
@@ -60,10 +60,10 @@ template<typename _MatrixType> class Tridiagonalization
    typedef Matrix<RealScalar, Size, 1> DiagonalType;
    typedef Matrix<RealScalar, SizeMinusOne, 1> SubDiagonalType;

-    typedef typename NestByValue<DiagonalCoeffs<MatrixType> >::RealReturnType DiagonalReturnType;
+    typedef typename NestByValue<Diagonal<MatrixType,0> >::RealReturnType DiagonalReturnType;

-    typedef typename NestByValue<DiagonalCoeffs<
-        NestByValue<Block<MatrixType,SizeMinusOne,SizeMinusOne> > > >::RealReturnType SubDiagonalReturnType;
+    typedef typename NestByValue<Diagonal<
+        NestByValue<Block<MatrixType,SizeMinusOne,SizeMinusOne> >,0 > >::RealReturnType SubDiagonalReturnType;

    /** This constructor initializes a Tridiagonalization object for
      * further use with Tridiagonalization::compute()
@@ -73,8 +73,7 @@ template<typename _MatrixType> class Tridiagonalization
    {}

    Tridiagonalization(const MatrixType& matrix)
-      : m_matrix(matrix),
-        m_hCoeffs(matrix.cols()-1)
+      : m_matrix(matrix), m_hCoeffs(matrix.cols()-1)
    {
      _compute(m_matrix, m_hCoeffs);
    }
@@ -122,13 +121,12 @@ template<typename _MatrixType> class Tridiagonalization

    static void decomposeInPlace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ = true);

-  private:
-
    static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs);

+  protected:
+
    static void _decomposeInPlace3x3(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ = true);

-  protected:
    MatrixType m_matrix;
    CoeffVectorType m_hCoeffs;
 };
@@ -192,7 +190,6 @@ void Tridiagonalization<MatrixType>::_compute(MatrixType& matA, CoeffVectorType&
 {
  assert(matA.rows()==matA.cols());
  int n = matA.rows();
-//   std::cerr << matA << "\n\n";
  for (int i = 0; i<n-2; ++i)
  {
    // let's consider the vector v = i-th column starting at position i+1
@@ -221,99 +218,20 @@ void Tridiagonalization<MatrixType>::_compute(MatrixType& matA, CoeffVectorType&

      matA.col(i).coeffRef(i+1) = 1;

-      /* This is the initial algorithm which minimize operation counts and maximize
-       * the use of Eigen's expression. Unfortunately, the first matrix-vector product
-       * using Part<LowerTriangular|Selfadjoint>  is very very slow */
-      #ifdef EIGEN_NEVER_DEFINED
-      // matrix - vector product
-      hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular|SelfAdjoint>()
-                                * (h * matA.col(i).end(n-i-1))).lazy();
-      // simple axpy
-      hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1)))
-                            * matA.col(i).end(n-i-1);
-      // rank-2 update
-      //Block<MatrixType,Dynamic,1> B(matA,i+1,i,n-i-1,1);
-      matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular>() -=
-            (matA.col(i).end(n-i-1) * hCoeffs.end(n-i-1).adjoint()).lazy()
-          + (hCoeffs.end(n-i-1) * matA.col(i).end(n-i-1).adjoint()).lazy();
-      #endif
-      /* end initial algorithm */
+      // hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular|SelfAdjoint>()
+      //                    *  matA.col(i).end(n-i-1)).lazy();
+      // TODO map the above code to the function call below:
+      ei_product_selfadjoint_vector<Scalar,MatrixType::Flags&RowMajorBit,LowerTriangularBit>
+        (n-i-1,matA.corner(BottomRight,n-i-1,n-i-1).data(), matA.stride(), matA.col(i).end(n-i-1).data(), const_cast<Scalar*>(hCoeffs.end(n-i-1).data()));

-      /* If we still want to minimize operation count (i.e., perform operation on the lower part only)
-       * then we could provide the following algorithm for selfadjoint - vector product. However, a full
-       * matrix-vector product is still faster (at least for dynamic size, and not too small, did not check
-       * small matrices). The algo performs block matrix-vector and transposed matrix vector products. */
-      #ifdef EIGEN_NEVER_DEFINED
-      int n4 = (std::max(0,n-4)/4)*4;
-      hCoeffs.end(n-i-1).setZero();
-      for (int b=i+1; b<n4; b+=4)
-      {
-        // the ?x4 part:
-        hCoeffs.end(b-4) +=
-            Block<MatrixType,Dynamic,4>(matA,b+4,b,n-b-4,4) * matA.template block<4,1>(b,i);
-        // the respective transposed part:
-        Block<CoeffVectorType,4,1>(hCoeffs, b, 0, 4,1) +=
-            Block<MatrixType,Dynamic,4>(matA,b+4,b,n-b-4,4).adjoint() * Block<MatrixType,Dynamic,1>(matA,b+4,i,n-b-4,1);
-        // the 4x4 block diagonal:
-        Block<CoeffVectorType,4,1>(hCoeffs, b, 0, 4,1) +=
-            (Block<MatrixType,4,4>(matA,b,b,4,4).template part<LowerTriangular|SelfAdjoint>()
-             * (h * Block<MatrixType,4,1>(matA,b,i,4,1))).lazy();
-      }
-      #endif
-      // todo: handle the remaining part
-      /* end optimized selfadjoint - vector product */
+      hCoeffs.end(n-i-1) = hCoeffs.end(n-i-1)*h
+                         + (h*ei_conj(h)*Scalar(-0.5)*(matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1)))) *
+                           matA.col(i).end(n-i-1);

-      /* Another interesting note: the above rank-2 update is much slower than the following hand written loop.
-       * After an analyze of the ASM, it seems GCC (4.2) generate poor code because of the Block. Moreover,
-       * if we remove the specialization of Block for Matrix then it is even worse, much worse ! */
-      #ifdef EIGEN_NEVER_DEFINED
+      // symmetric rank-2 update
      for (int j1=i+1; j1<n; ++j1)
-      for (int i1=j1;  i1<n; ++i1)
-        matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
-                              + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));
-      #endif
-      /* end hand writen partial rank-2 update */
-
-      /* The current fastest implementation: the full matrix is used, no "optimization" to use/compute
-       * only half of the matrix. Custom vectorization of the inner col -= alpha X + beta Y such that access
-       * to col are always aligned. Once we support that in Assign, then the algorithm could be rewriten as
-       * a single compact expression. This code is therefore a good benchmark when will do that. */
-
-      // let's use the end of hCoeffs to store temporary values:
-      hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1) * (h * matA.col(i).end(n-i-1))).lazy();
-      // FIXME in the above expr a temporary is created because of the scalar multiple by h
-
-      hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1)))
-                            * matA.col(i).end(n-i-1);
-
-      const Scalar* EIGEN_RESTRICT pb = &matA.coeffRef(0,i);
-      const Scalar* EIGEN_RESTRICT pa = (&hCoeffs.coeffRef(0)) - 1;
-      for (int j1=i+1; j1<n; ++j1)
-      {
-        int starti = i+1;
-        int alignedEnd = starti;
-        if (PacketSize>1)
-        {
-          int alignedStart = (starti) + ei_alignmentOffset(&matA.coeffRef(starti,j1), n-starti);
-          alignedEnd = alignedStart + ((n-alignedStart)/PacketSize)*PacketSize;
-
-          for (int i1=starti; i1<alignedStart; ++i1)
-            matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
-                                  + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));
-
-          Packet tmp0 = ei_pset1(hCoeffs.coeff(j1-1));
-          Packet tmp1 = ei_pset1(matA.coeff(j1,i));
-          Scalar* pc = &matA.coeffRef(0,j1);
-          for (int i1=alignedStart ; i1<alignedEnd; i1+=PacketSize)
-            ei_pstore(pc+i1,ei_psub(ei_pload(pc+i1),
-              ei_padd(ei_pmul(tmp0, ei_ploadu(pb+i1)),
-                      ei_pmul(tmp1, ei_ploadu(pa+i1)))));
-        }
-        for (int i1=alignedEnd; i1<n; ++i1)
-          matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
-                                + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));
-      }
-      /* end optimized implementation */
+        matA.col(j1).end(n-j1) -= matA.col(i).end(n-j1) * ei_conj(hCoeffs.coeff(j1-1))
+                                + hCoeffs.end(n-j1) * ei_conj(matA.coeff(j1,i));

      // note: at that point matA(i+1,i+1) is the (i+1)-th element of the final diagonal
      // note: the sequence of the beta values leads to the subdiagonal entries
--- a/Eigen/src/SVD/CMakeLists.txt
+++ b/Eigen/src/SVD/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_SVD_SRCS "*.h")

 INSTALL(FILES
  ${Eigen_SVD_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SVD
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SVD COMPONENT Devel
  )
--- a/Eigen/src/Sparse/AmbiVector.h
+++ b/Eigen/src/Sparse/AmbiVector.h
@@ -36,7 +36,7 @@ template<typename _Scalar> class AmbiVector
    typedef _Scalar Scalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
    AmbiVector(int size)
-      : m_buffer(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
+      : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
    {
      resize(size);
    }
@@ -44,7 +44,7 @@ template<typename _Scalar> class AmbiVector
    void init(RealScalar estimatedDensity);
    void init(int mode);

-    void nonZeros() const;
+    int nonZeros() const;

    /** Specifies a sub-vector to work on */
    void setBounds(int start, int end) { m_start = start; m_end = end; }
@@ -53,7 +53,7 @@ template<typename _Scalar> class AmbiVector

    void restart();
    Scalar& coeffRef(int i);
-    Scalar coeff(int i);
+    Scalar& coeff(int i);

    class Iterator;

@@ -112,6 +112,7 @@ template<typename _Scalar> class AmbiVector

    // used to store data in both mode
    Scalar* m_buffer;
+    Scalar m_zero;
    int m_size;
    int m_start;
    int m_end;
@@ -131,7 +132,7 @@ template<typename _Scalar> class AmbiVector

 /** \returns the number of non zeros in the current sub vector */
 template<typename Scalar>
-void AmbiVector<Scalar>::nonZeros() const
+int AmbiVector<Scalar>::nonZeros() const
 {
  if (m_mode==IsSparse)
    return m_llSize;
@@ -254,7 +255,7 @@ Scalar& AmbiVector<Scalar>::coeffRef(int i)
 }

 template<typename Scalar>
-Scalar AmbiVector<Scalar>::coeff(int i)
+Scalar& AmbiVector<Scalar>::coeff(int i)
 {
  if (m_mode==IsDense)
    return m_buffer[i];
@@ -264,7 +265,7 @@ Scalar AmbiVector<Scalar>::coeff(int i)
    ei_assert(m_mode==IsSparse);
    if ((m_llSize==0) || (i<llElements[m_llStart].index))
    {
-      return Scalar(0);
+      return m_zero;
    }
    else
    {
@@ -275,7 +276,7 @@ Scalar AmbiVector<Scalar>::coeff(int i)
      if (llElements[elid].index==i)
        return llElements[m_llCurrent].value;
      else
-        return Scalar(0);
+        return m_zero;
    }
  }
 }
--- a/Eigen/src/Sparse/CMakeLists.txt
+++ b/Eigen/src/Sparse/CMakeLists.txt
@@ -2,5 +2,5 @@ FILE(GLOB Eigen_Sparse_SRCS "*.h")

 INSTALL(FILES 
  ${Eigen_Sparse_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Sparse
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Sparse COMPONENT Devel
  )
--- a/Eigen/src/Sparse/CholmodSupport.h
+++ b/Eigen/src/Sparse/CholmodSupport.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -54,16 +54,17 @@ void ei_cholmod_configure_matrix(CholmodType& mat)
  }
 }

-template<typename Scalar, int Flags>
-cholmod_sparse SparseMatrixBase<Scalar,Flags>::asCholmodMatrix()
+template<typename Derived>
+cholmod_sparse SparseMatrixBase<Derived>::asCholmodMatrix()
 {
+  typedef typename Derived::Scalar Scalar;
  cholmod_sparse res;
  res.nzmax   = nonZeros();
  res.nrow    = rows();;
  res.ncol    = cols();
-  res.p       = _outerIndexPtr();
-  res.i       = _innerIndexPtr();
-  res.x       = _valuePtr();
+  res.p       = derived()._outerIndexPtr();
+  res.i       = derived()._innerIndexPtr();
+  res.x       = derived()._valuePtr();
  res.xtype = CHOLMOD_REAL;
  res.itype = CHOLMOD_INT;
  res.sorted = 1;
@@ -73,11 +74,11 @@ cholmod_sparse SparseMatrixBase<Scalar,Flags>::asCholmodMatrix()

  ei_cholmod_configure_matrix<Scalar>(res);

-  if (Flags & SelfAdjoint)
+  if (Derived::Flags & SelfAdjoint)
  {
-    if (Flags & UpperTriangular)
+    if (Derived::Flags & UpperTriangular)
      res.stype = 1;
-    else if (Flags & LowerTriangular)
+    else if (Derived::Flags & LowerTriangular)
      res.stype = -1;
    else
      res.stype = 0;
@@ -108,14 +109,14 @@ cholmod_dense ei_cholmod_map_eigen_to_dense(MatrixBase<Derived>& mat)
 }

 template<typename Scalar, int Flags>
-MappedSparseMatrix<Scalar,Flags>::MappedSparseMatrix(taucs_ccs_matrix& taucsMat)
+MappedSparseMatrix<Scalar,Flags>::MappedSparseMatrix(cholmod_sparse& cm)
 {
  m_innerSize = cm.nrow;
  m_outerSize = cm.ncol;
  m_outerIndex = reinterpret_cast<int*>(cm.p);
  m_innerIndices = reinterpret_cast<int*>(cm.i);
  m_values = reinterpret_cast<Scalar*>(cm.x);
-  m_nnz = res.m_outerIndex[cm.ncol]);
+  m_nnz = m_outerIndex[cm.ncol];
 }

 template<typename MatrixType>
@@ -123,8 +124,8 @@ class SparseLLT<MatrixType,Cholmod> : public SparseLLT<MatrixType>
 {
  protected:
    typedef SparseLLT<MatrixType> Base;
-    using typename Base::Scalar;
-    using Base::RealScalar;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
    using Base::MatrixLIsDirty;
    using Base::SupernodalFactorIsDirty;
    using Base::m_flags;
@@ -205,7 +206,7 @@ SparseLLT<MatrixType,Cholmod>::matrixL() const
    ei_assert(!(m_status & SupernodalFactorIsDirty));

    cholmod_sparse* cmRes = cholmod_factor_to_sparse(m_cholmodFactor, &m_cholmod);
-    const_cast<typename Base::CholMatrixType&>(m_matrix) = Base::CholMatrixType::Map(*cmRes);
+    const_cast<typename Base::CholMatrixType&>(m_matrix) = MappedSparseMatrix<Scalar>(*cmRes);
    free(cmRes);

    m_status = (m_status & ~MatrixLIsDirty);
--- a/Eigen/src/Sparse/CompressedStorage.h
+++ b/Eigen/src/Sparse/CompressedStorage.h
@@ -155,7 +155,7 @@ class CompressedStorage
    /** Like at(), but the search is performed in the range [start,end) */
    inline Scalar atInRange(size_t start, size_t end, int key, Scalar defaultValue = Scalar(0)) const
    {
-      if (start==end)
+      if (start>=end)
        return Scalar(0);
      else if (end>start && key==m_indices[end-1])
        return m_values[end-1];
--- a/Eigen/src/Sparse/DynamicSparseMatrix.h
+++ b/Eigen/src/Sparse/DynamicSparseMatrix.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -83,6 +83,9 @@ class DynamicSparseMatrix
    inline int innerSize() const { return m_innerSize; }
    inline int outerSize() const { return m_data.size(); }
    inline int innerNonZeros(int j) const { return m_data[j].size(); }
+    
+    std::vector<CompressedStorage<Scalar> >& _data() { return m_data; }
+    const std::vector<CompressedStorage<Scalar> >& _data() const { return m_data; }

    /** \returns the coefficient value at given position \a row, \a col
      * This operation involes a log(rho*outer_size) binary search.
@@ -107,14 +110,14 @@ class DynamicSparseMatrix

    class InnerIterator;

-    inline void setZero()
+    void setZero()
    {
      for (int j=0; j<outerSize(); ++j)
        m_data[j].clear();
    }

    /** \returns the number of non zero coefficients */
-    inline int nonZeros() const
+    int nonZeros() const
    {
      int res = 0;
      for (int j=0; j<outerSize(); ++j)
@@ -122,18 +125,39 @@ class DynamicSparseMatrix
      return res;
    }

-    /** Set the matrix to zero and reserve the memory for \a reserveSize nonzero coefficients. */
-    inline void startFill(int reserveSize = 1000)
+    /** \deprecated
+      * Set the matrix to zero and reserve the memory for \a reserveSize nonzero coefficients. */
+    EIGEN_DEPRECATED void startFill(int reserveSize = 1000)
    {
-      int reserveSizePerVector = std::max(reserveSize/outerSize(),4);
-      for (int j=0; j<outerSize(); ++j)
+      setZero();
+      reserve(reserveSize);
+    }
+    
+    void reserve(int reserveSize = 1000)
+    {
+      if (outerSize()>0)
      {
-        m_data[j].clear();
-        m_data[j].reserve(reserveSizePerVector);
+        int reserveSizePerVector = std::max(reserveSize/outerSize(),4);
+        for (int j=0; j<outerSize(); ++j)
+        {
+          m_data[j].reserve(reserveSizePerVector);
+        }
      }
    }
+    
+    inline void startVec(int /*outer*/) {}
+    
+    inline Scalar& insertBack(int outer, int inner)
+    {
+      ei_assert(outer<int(m_data.size()) && inner<m_innerSize && "out of range");
+      ei_assert(((m_data[outer].size()==0) || (m_data[outer].index(m_data[outer].size()-1)<inner)) 
+                && "wrong sorted insertion");
+      m_data[outer].append(0, inner);
+      return m_data[outer].value(m_data[outer].size()-1);
+    }

-    /** inserts a nonzero coefficient at given coordinates \a row, \a col and returns its reference assuming that:
+    /** \deprecated use insert()
+      * inserts a nonzero coefficient at given coordinates \a row, \a col and returns its reference assuming that:
      *  1 - the coefficient does not exist yet
      *  2 - this the coefficient with greater inner coordinate for the given outer coordinate.
      * In other words, assuming \c *this is column-major, then there must not exists any nonzero coefficient of coordinates
@@ -141,21 +165,24 @@ class DynamicSparseMatrix
      * 
      * \see fillrand(), coeffRef()
      */
-    inline Scalar& fill(int row, int col)
+    EIGEN_DEPRECATED Scalar& fill(int row, int col)
    {
      const int outer = IsRowMajor ? row : col;
      const int inner = IsRowMajor ? col : row;
-      ei_assert(outer<int(m_data.size()) && inner<m_innerSize);
-      ei_assert((m_data[outer].size()==0) || (m_data[outer].index(m_data[outer].size()-1)<inner));
-      m_data[outer].append(0, inner);
-      return m_data[outer].value(m_data[outer].size()-1);
+      return insertBack(outer,inner);
    }

-    /** Like fill() but with random inner coordinates.
+    /** \deprecated use insert()
+      * Like fill() but with random inner coordinates.
      * Compared to the generic coeffRef(), the unique limitation is that we assume
      * the coefficient does not exist yet.
      */
-    inline Scalar& fillrand(int row, int col)
+    EIGEN_DEPRECATED Scalar& fillrand(int row, int col)
+    {
+      return insert(row,col);
+    }
+    
+    inline Scalar& insert(int row, int col)
    {
      const int outer = IsRowMajor ? row : col;
      const int inner = IsRowMajor ? col : row;
@@ -175,8 +202,11 @@ class DynamicSparseMatrix
      return m_data[outer].value(id+1);
    }

-    /** Does nothing. Provided for compatibility with SparseMatrix. */
-    inline void endFill() {}
+    /** \deprecated use finalize()
+      * Does nothing. Provided for compatibility with SparseMatrix. */
+    EIGEN_DEPRECATED void endFill() {}
+    
+    inline void finalize() {}
    
    void prune(Scalar reference, RealScalar epsilon = precision<RealScalar>())
    {
@@ -215,7 +245,7 @@ class DynamicSparseMatrix
    }

    inline DynamicSparseMatrix()
-      : m_innerSize(0)
+      : m_innerSize(0), m_data(0)
    {
      ei_assert(innerSize()==0 && outerSize()==0);
    }
--- a/Eigen/src/Sparse/MappedSparseMatrix.h
+++ b/Eigen/src/Sparse/MappedSparseMatrix.h
@@ -65,10 +65,10 @@ class MappedSparseMatrix

    //----------------------------------------
    // direct access interface
-    inline const Scalar* _valuePtr() const { return &m_values; }
-    inline Scalar* _valuePtr() { return &m_values; }
+    inline const Scalar* _valuePtr() const { return m_values; }
+    inline Scalar* _valuePtr() { return m_values; }

-    inline const int* _innerIndexPtr() const { return &m_innerIndices; }
+    inline const int* _innerIndexPtr() const { return m_innerIndices; }
    inline int* _innerIndexPtr() { return m_innerIndices; }

    inline const int* _outerIndexPtr() const { return m_outerIndex; }
@@ -77,8 +77,8 @@ class MappedSparseMatrix

    inline Scalar coeff(int row, int col) const
    {
-      const int outer = RowMajor ? row : col;
-      const int inner = RowMajor ? col : row;
+      const int outer = IsRowMajor ? row : col;
+      const int inner = IsRowMajor ? col : row;

      int start = m_outerIndex[outer];
      int end = m_outerIndex[outer+1];
@@ -96,8 +96,8 @@ class MappedSparseMatrix

    inline Scalar& coeffRef(int row, int col)
    {
-      const int outer = RowMajor ? row : col;
-      const int inner = RowMajor ? col : row;
+      const int outer = IsRowMajor ? row : col;
+      const int inner = IsRowMajor ? col : row;

      int start = m_outerIndex[outer];
      int end = m_outerIndex[outer+1];
@@ -108,7 +108,7 @@ class MappedSparseMatrix
      ei_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
      return m_values[id];
    }
-    
+
    class InnerIterator;

    /** \returns the number of non zero coefficients */
@@ -140,21 +140,25 @@ class MappedSparseMatrix<Scalar,_Flags>::InnerIterator
 {
  public:
    InnerIterator(const MappedSparseMatrix& mat, int outer)
-      : m_matrix(mat), m_outer(outer), m_id(mat._outerIndexPtr[outer]), m_start(m_id), m_end(mat._outerIndexPtr[outer+1])
+      : m_matrix(mat),
+        m_outer(outer),
+        m_id(mat._outerIndexPtr()[outer]),
+        m_start(m_id),
+        m_end(mat._outerIndexPtr()[outer+1])
    {}

    template<unsigned int Added, unsigned int Removed>
    InnerIterator(const Flagged<MappedSparseMatrix,Added,Removed>& mat, int outer)
-      : m_matrix(mat._expression()), m_id(m_matrix._outerIndexPtr[outer]),
-        m_start(m_id), m_end(m_matrix._outerIndexPtr[outer+1])
+      : m_matrix(mat._expression()), m_id(m_matrix._outerIndexPtr()[outer]),
+        m_start(m_id), m_end(m_matrix._outerIndexPtr()[outer+1])
    {}

    inline InnerIterator& operator++() { m_id++; return *this; }

-    inline Scalar value() const { return m_matrix.m_valuePtr[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix._valuePtr[m_id]); }
+    inline Scalar value() const { return m_matrix._valuePtr()[m_id]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix._valuePtr()[m_id]); }

-    inline int index() const { return m_matrix._innerIndexPtr(m_id); }
+    inline int index() const { return m_matrix._innerIndexPtr()[m_id]; }
    inline int row() const { return IsRowMajor ? m_outer : index(); }
    inline int col() const { return IsRowMajor ? index() : m_outer; }

--- a/Eigen/src/Sparse/RandomSetter.h
+++ b/Eigen/src/Sparse/RandomSetter.h
@@ -224,19 +224,28 @@ class RandomSetter
      KeyType keyBitsMask = (1<<m_keyBitsOffset)-1;
      if (!SwapStorage) // also means the map is sorted
      {
-        mp_target->startFill(nonZeros());
+        mp_target->setZero();
+        mp_target->reserve(nonZeros());
+        int prevOuter = -1;
        for (int k=0; k<m_outerPackets; ++k)
        {
+          mp_target->startVec(k);
          const int outerOffset = (1<<OuterPacketBits) * k;
          typename HashMapType::iterator end = m_hashmaps[k].end();
          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
          {
            const int outer = (it->first >> m_keyBitsOffset) + outerOffset;
            const int inner = it->first & keyBitsMask;
-            mp_target->fill(TargetRowMajor ? outer : inner, TargetRowMajor ? inner : outer) = it->second.value;
+            if (prevOuter!=outer)
+            {
+              for (int j=prevOuter+1;j<=outer;++j)
+                mp_target->startVec(j);
+              prevOuter = outer;
+            }
+            mp_target->insertBack(outer, inner) = it->second.value;
          }
        }
-        mp_target->endFill();
+        mp_target->finalize();
      }
      else
      {
--- a/Eigen/src/Sparse/SparseBlock.h
+++ b/Eigen/src/Sparse/SparseBlock.h
@@ -26,70 +26,274 @@
 #ifndef EIGEN_SPARSE_BLOCK_H
 #define EIGEN_SPARSE_BLOCK_H

-template<typename MatrixType>
-struct ei_traits<SparseInnerVector<MatrixType> >
+template<typename MatrixType, int Size>
+struct ei_traits<SparseInnerVectorSet<MatrixType, Size> >
 {
  typedef typename ei_traits<MatrixType>::Scalar Scalar;
  enum {
    IsRowMajor = (int(MatrixType::Flags)&RowMajorBit)==RowMajorBit,
    Flags = MatrixType::Flags,
-    RowsAtCompileTime = IsRowMajor ? 1 : MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = IsRowMajor ? MatrixType::ColsAtCompileTime : 1,
+    RowsAtCompileTime = IsRowMajor ? Size : MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = IsRowMajor ? MatrixType::ColsAtCompileTime : Size,
    CoeffReadCost = MatrixType::CoeffReadCost
  };
 };

-template<typename MatrixType>
-class SparseInnerVector : ei_no_assignment_operator,
-  public SparseMatrixBase<SparseInnerVector<MatrixType> >
+template<typename MatrixType, int Size>
+class SparseInnerVectorSet : ei_no_assignment_operator,
+  public SparseMatrixBase<SparseInnerVectorSet<MatrixType, Size> >
 {
-    enum {
-      IsRowMajor = ei_traits<SparseInnerVector>::IsRowMajor
-    };
-public:
+    enum { IsRowMajor = ei_traits<SparseInnerVectorSet>::IsRowMajor };
+  public:

-    EIGEN_SPARSE_GENERIC_PUBLIC_INTERFACE(SparseInnerVector)
-    class InnerIterator;
-
-    inline SparseInnerVector(const MatrixType& matrix, int outer)
-      : m_matrix(matrix), m_outer(outer)
+    EIGEN_SPARSE_GENERIC_PUBLIC_INTERFACE(SparseInnerVectorSet)
+    class InnerIterator: public MatrixType::InnerIterator
    {
+      public:
+        inline InnerIterator(const SparseInnerVectorSet& xpr, int outer)
+          : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer)
+        {}
+    };
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, int outerStart, int outerSize)
+      : m_matrix(matrix), m_outerStart(outerStart), m_outerSize(outerSize)
+    {
+      ei_assert( (outerStart>=0) && ((outerStart+outerSize)<=matrix.outerSize()) );
+    }
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, int outer)
+      : m_matrix(matrix), m_outerStart(outer), m_outerSize(Size)
+    {
+      ei_assert(Size!=Dynamic);
      ei_assert( (outer>=0) && (outer<matrix.outerSize()) );
    }

-    EIGEN_STRONG_INLINE int rows() const { return IsRowMajor ? 1 : m_matrix.rows(); }
-    EIGEN_STRONG_INLINE int cols() const { return IsRowMajor ? m_matrix.cols() : 1; }
+//     template<typename OtherDerived>
+//     inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+//     {
+//       return *this;
+//     }
+
+//     template<typename Sparse>
+//     inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+//     {
+//       return *this;
+//     }
+
+    EIGEN_STRONG_INLINE int rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE int cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }

  protected:

    const typename MatrixType::Nested m_matrix;
-    int m_outer;
+    int m_outerStart;
+    const ei_int_if_dynamic<Size> m_outerSize;

 };

-template<typename MatrixType>
-class SparseInnerVector<MatrixType>::InnerIterator : public MatrixType::InnerIterator
+/***************************************************************************
+* specialisation for DynamicSparseMatrix
+***************************************************************************/
+
+template<typename _Scalar, int _Options, int Size>
+class SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options>, Size>
+  : public SparseMatrixBase<SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options>, Size> >
 {
-public:
-  inline InnerIterator(const SparseInnerVector& xpr, int outer=0)
-    : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outer)
-  {
-    ei_assert(outer==0);
-  }
+    typedef DynamicSparseMatrix<_Scalar, _Options> MatrixType;
+    enum { IsRowMajor = ei_traits<SparseInnerVectorSet>::IsRowMajor };
+  public:
+
+    EIGEN_SPARSE_GENERIC_PUBLIC_INTERFACE(SparseInnerVectorSet)
+    class InnerIterator: public MatrixType::InnerIterator
+    {
+      public:
+        inline InnerIterator(const SparseInnerVectorSet& xpr, int outer)
+          : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer)
+        {}
+    };
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, int outerStart, int outerSize)
+      : m_matrix(matrix), m_outerStart(outerStart), m_outerSize(outerSize)
+    {
+      ei_assert( (outerStart>=0) && ((outerStart+outerSize)<=matrix.outerSize()) );
+    }
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, int outer)
+      : m_matrix(matrix), m_outerStart(outer), m_outerSize(Size)
+    {
+      ei_assert(Size!=Dynamic);
+      ei_assert( (outer>=0) && (outer<matrix.outerSize()) );
+    }
+
+    template<typename OtherDerived>
+    inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+    {
+      if (IsRowMajor != ((OtherDerived::Flags&RowMajorBit)==RowMajorBit))
+      {
+        // need to transpose => perform a block evaluation followed by a big swap
+        DynamicSparseMatrix<Scalar,IsRowMajor?RowMajorBit:0> aux(other);
+        *this = aux.markAsRValue();
+      }
+      else
+      {
+        // evaluate/copy vector per vector
+        for (int j=0; j<m_outerSize.value(); ++j)
+        {
+          SparseVector<Scalar,IsRowMajor ? RowMajorBit : 0> aux(other.innerVector(j));
+          m_matrix.const_cast_derived()._data()[m_outerStart+j].swap(aux._data());
+        }
+      }
+      return *this;
+    }
+
+    inline SparseInnerVectorSet& operator=(const SparseInnerVectorSet& other)
+    {
+      return operator=<SparseInnerVectorSet>(other);
+    }
+    
+    int nonZeros() const
+    { 
+      int count = 0;
+      for (int j=0; j<m_outerSize; ++j)
+        count += m_matrix._data()[m_outerStart+j].size();
+      return count;
+    }
+    
+    const Scalar& lastCoeff() const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(SparseInnerVectorSet);
+      ei_assert(m_matrix.data()[m_outerStart].size()>0);
+      return m_matrix.data()[m_outerStart].vale(m_matrix.data()[m_outerStart].size()-1);
+    }
+
+//     template<typename Sparse>
+//     inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+//     {
+//       return *this;
+//     }
+
+    EIGEN_STRONG_INLINE int rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE int cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+  protected:
+
+    const typename MatrixType::Nested m_matrix;
+    int m_outerStart;
+    const ei_int_if_dynamic<Size> m_outerSize;
+
 };

+
+/***************************************************************************
+* specialisation for SparseMatrix
+***************************************************************************/
+
+template<typename _Scalar, int _Options, int Size>
+class SparseInnerVectorSet<SparseMatrix<_Scalar, _Options>, Size>
+  : public SparseMatrixBase<SparseInnerVectorSet<SparseMatrix<_Scalar, _Options>, Size> >
+{
+    typedef SparseMatrix<_Scalar, _Options> MatrixType;
+    enum { IsRowMajor = ei_traits<SparseInnerVectorSet>::IsRowMajor };
+  public:
+
+    EIGEN_SPARSE_GENERIC_PUBLIC_INTERFACE(SparseInnerVectorSet)
+    class InnerIterator: public MatrixType::InnerIterator
+    {
+      public:
+        inline InnerIterator(const SparseInnerVectorSet& xpr, int outer)
+          : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer)
+        {}
+    };
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, int outerStart, int outerSize)
+      : m_matrix(matrix), m_outerStart(outerStart), m_outerSize(outerSize)
+    {
+      ei_assert( (outerStart>=0) && ((outerStart+outerSize)<=matrix.outerSize()) );
+    }
+
+    inline SparseInnerVectorSet(const MatrixType& matrix, int outer)
+      : m_matrix(matrix), m_outerStart(outer)
+    {
+      ei_assert(Size==1);
+      ei_assert( (outer>=0) && (outer<matrix.outerSize()) );
+    }
+
+    template<typename OtherDerived>
+    inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+    {
+      if (IsRowMajor != ((OtherDerived::Flags&RowMajorBit)==RowMajorBit))
+      {
+        // need to transpose => perform a block evaluation followed by a big swap
+        DynamicSparseMatrix<Scalar,IsRowMajor?RowMajorBit:0> aux(other);
+        *this = aux.markAsRValue();
+      }
+      else
+      {
+        // evaluate/copy vector per vector
+        for (int j=0; j<m_outerSize.value(); ++j)
+        {
+          SparseVector<Scalar,IsRowMajor ? RowMajorBit : 0> aux(other.innerVector(j));
+          m_matrix.const_cast_derived()._data()[m_outerStart+j].swap(aux._data());
+        }
+      }
+      return *this;
+    }
+
+    inline SparseInnerVectorSet& operator=(const SparseInnerVectorSet& other)
+    {
+      return operator=<SparseInnerVectorSet>(other);
+    }
+
+    inline const Scalar* _valuePtr() const
+    { return m_matrix._valuePtr() + m_matrix._outerIndexPtr()[m_outerStart]; }
+    inline const int* _innerIndexPtr() const
+    { return m_matrix._innerIndexPtr() + m_matrix._outerIndexPtr()[m_outerStart]; }
+    inline const int* _outerIndexPtr() const
+    { return m_matrix._outerIndexPtr() + m_outerStart; }
+    
+    int nonZeros() const
+    {
+      return  size_t(m_matrix._outerIndexPtr()[m_outerStart+m_outerSize.value()])
+            - size_t(m_matrix._outerIndexPtr()[m_outerStart]); }
+    
+    const Scalar& lastCoeff() const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(SparseInnerVectorSet);
+      ei_assert(nonZeros()>0);
+      return m_matrix._valuePtr()[m_matrix._outerIndexPtr()[m_outerStart+1]-1];
+    }
+
+//     template<typename Sparse>
+//     inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+//     {
+//       return *this;
+//     }
+
+    EIGEN_STRONG_INLINE int rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    EIGEN_STRONG_INLINE int cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+  protected:
+
+    const typename MatrixType::Nested m_matrix;
+    int m_outerStart;
+    const ei_int_if_dynamic<Size> m_outerSize;
+
+};
+
+//----------
+
 /** \returns the i-th row of the matrix \c *this. For row-major matrix only. */
 template<typename Derived>
-SparseInnerVector<Derived> SparseMatrixBase<Derived>::row(int i)
+SparseInnerVectorSet<Derived,1> SparseMatrixBase<Derived>::row(int i)
 {
  EIGEN_STATIC_ASSERT(IsRowMajor,THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES);
  return innerVector(i);
 }

-/** \returns the i-th row of the matrix \c *this. For row-major matrix only. 
+/** \returns the i-th row of the matrix \c *this. For row-major matrix only.
  * (read-only version) */
 template<typename Derived>
-const SparseInnerVector<Derived> SparseMatrixBase<Derived>::row(int i) const
+const SparseInnerVectorSet<Derived,1> SparseMatrixBase<Derived>::row(int i) const
 {
  EIGEN_STATIC_ASSERT(IsRowMajor,THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES);
  return innerVector(i);
@@ -97,18 +301,18 @@ const SparseInnerVector<Derived> SparseMatrixBase<Derived>::row(int i) const

 /** \returns the i-th column of the matrix \c *this. For column-major matrix only. */
 template<typename Derived>
-SparseInnerVector<Derived> SparseMatrixBase<Derived>::col(int i)
+SparseInnerVectorSet<Derived,1> SparseMatrixBase<Derived>::col(int i)
 {
-  EIGEN_STATIC_ASSERT(!IsRowMajor,THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES);
+  EIGEN_STATIC_ASSERT(!IsRowMajor,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
  return innerVector(i);
 }

-/** \returns the i-th column of the matrix \c *this. For column-major matrix only. 
+/** \returns the i-th column of the matrix \c *this. For column-major matrix only.
  * (read-only version) */
 template<typename Derived>
-const SparseInnerVector<Derived> SparseMatrixBase<Derived>::col(int i) const
+const SparseInnerVectorSet<Derived,1> SparseMatrixBase<Derived>::col(int i) const
 {
-  EIGEN_STATIC_ASSERT(!IsRowMajor,THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES);
+  EIGEN_STATIC_ASSERT(!IsRowMajor,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
  return innerVector(i);
 }

@@ -116,15 +320,65 @@ const SparseInnerVector<Derived> SparseMatrixBase<Derived>::col(int i) const
  * is col-major (resp. row-major).
  */
 template<typename Derived>
-SparseInnerVector<Derived> SparseMatrixBase<Derived>::innerVector(int outer)
-{ return SparseInnerVector<Derived>(derived(), outer); }
+SparseInnerVectorSet<Derived,1> SparseMatrixBase<Derived>::innerVector(int outer)
+{ return SparseInnerVectorSet<Derived,1>(derived(), outer); }

 /** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
  * is col-major (resp. row-major). Read-only.
  */
 template<typename Derived>
-const SparseInnerVector<Derived> SparseMatrixBase<Derived>::innerVector(int outer) const
-{ return SparseInnerVector<Derived>(derived(), outer); }
+const SparseInnerVectorSet<Derived,1> SparseMatrixBase<Derived>::innerVector(int outer) const
+{ return SparseInnerVectorSet<Derived,1>(derived(), outer); }
+
+//----------
+
+/** \returns the i-th row of the matrix \c *this. For row-major matrix only. */
+template<typename Derived>
+SparseInnerVectorSet<Derived,Dynamic> SparseMatrixBase<Derived>::subrows(int start, int size)
+{
+  EIGEN_STATIC_ASSERT(IsRowMajor,THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES);
+  return innerVectors(start, size);
+}
+
+/** \returns the i-th row of the matrix \c *this. For row-major matrix only.
+  * (read-only version) */
+template<typename Derived>
+const SparseInnerVectorSet<Derived,Dynamic> SparseMatrixBase<Derived>::subrows(int start, int size) const
+{
+  EIGEN_STATIC_ASSERT(IsRowMajor,THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES);
+  return innerVectors(start, size);
+}
+
+/** \returns the i-th column of the matrix \c *this. For column-major matrix only. */
+template<typename Derived>
+SparseInnerVectorSet<Derived,Dynamic> SparseMatrixBase<Derived>::subcols(int start, int size)
+{
+  EIGEN_STATIC_ASSERT(!IsRowMajor,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  return innerVectors(start, size);
+}
+
+/** \returns the i-th column of the matrix \c *this. For column-major matrix only.
+  * (read-only version) */
+template<typename Derived>
+const SparseInnerVectorSet<Derived,Dynamic> SparseMatrixBase<Derived>::subcols(int start, int size) const
+{
+  EIGEN_STATIC_ASSERT(!IsRowMajor,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  return innerVectors(start, size);
+}
+
+/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+  * is col-major (resp. row-major).
+  */
+template<typename Derived>
+SparseInnerVectorSet<Derived,Dynamic> SparseMatrixBase<Derived>::innerVectors(int outerStart, int outerSize)
+{ return SparseInnerVectorSet<Derived,Dynamic>(derived(), outerStart, outerSize); }
+
+/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+  * is col-major (resp. row-major). Read-only.
+  */
+template<typename Derived>
+const SparseInnerVectorSet<Derived,Dynamic> SparseMatrixBase<Derived>::innerVectors(int outerStart, int outerSize) const
+{ return SparseInnerVectorSet<Derived,Dynamic>(derived(), outerStart, outerSize); }

 # if 0
 template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess>
--- a/Eigen/src/Sparse/SparseCwiseBinaryOp.h
+++ b/Eigen/src/Sparse/SparseCwiseBinaryOp.h
@@ -86,6 +86,8 @@ class SparseCwiseBinaryOp : ei_no_assignment_operator,
    EIGEN_STRONG_INLINE SparseCwiseBinaryOp(const Lhs& lhs, const Rhs& rhs, const BinaryOp& func = BinaryOp())
      : m_lhs(lhs), m_rhs(rhs), m_functor(func)
    {
+      EIGEN_STATIC_ASSERT((_LhsNested::Flags&RowMajorBit)==(_RhsNested::Flags&RowMajorBit),
+        BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER)
      EIGEN_STATIC_ASSERT((ei_functor_allows_mixing_real_and_complex<BinaryOp>::ret
                           ? int(ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret)
                           : int(ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret)),
@@ -130,11 +132,10 @@ class SparseCwiseBinaryOp<BinaryOp,Lhs,Rhs>::InnerIterator
 * Implementation of inner-iterators
 ***************************************************************************/

-// template<typename T> struct ei_is_scalar_product { enum { ret = false }; };
-// template<typename T> struct ei_is_scalar_product<ei_scalar_product_op<T> > { enum { ret = true }; };
-
-// helper class
+// template<typename T> struct ei_func_is_conjunction { enum { ret = false }; };
+// template<typename T> struct ei_func_is_conjunction<ei_scalar_product_op<T> > { enum { ret = true }; };

+// TODO generalize the ei_scalar_product_op specialization to all conjunctions if any !

 // sparse - sparse  (generic)
 template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived>
@@ -259,12 +260,13 @@ class ei_sparse_cwise_binary_op_inner_iterator_selector<ei_scalar_product_op<T>,
    typedef SparseCwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
    typedef typename CwiseBinaryXpr::Scalar Scalar;
    typedef typename ei_traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
+    typedef typename ei_traits<CwiseBinaryXpr>::RhsNested RhsNested;
    typedef typename _LhsNested::InnerIterator LhsIterator;
    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
  public:

    EIGEN_STRONG_INLINE ei_sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, int outer)
-      : m_xpr(xpr), m_lhsIter(xpr.lhs(),outer), m_functor(xpr.functor()), m_outer(outer)
+      : m_rhs(xpr.rhs()), m_lhsIter(xpr.lhs(),outer), m_functor(xpr.functor()), m_outer(outer)
    {}

    EIGEN_STRONG_INLINE Derived& operator++()
@@ -275,7 +277,7 @@ class ei_sparse_cwise_binary_op_inner_iterator_selector<ei_scalar_product_op<T>,

    EIGEN_STRONG_INLINE Scalar value() const
    { return m_functor(m_lhsIter.value(),
-                       m_xpr.rhs().coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
+                       m_rhs.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }

    EIGEN_STRONG_INLINE int index() const { return m_lhsIter.index(); }
    EIGEN_STRONG_INLINE int row() const { return m_lhsIter.row(); }
@@ -284,9 +286,9 @@ class ei_sparse_cwise_binary_op_inner_iterator_selector<ei_scalar_product_op<T>,
    EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }

  protected:
-    const CwiseBinaryXpr& m_xpr;
+    const RhsNested m_rhs;
    LhsIterator m_lhsIter;
-    const BinaryFunc& m_functor;
+    const BinaryFunc m_functor;
    const int m_outer;
 };

@@ -329,6 +331,10 @@ class ei_sparse_cwise_binary_op_inner_iterator_selector<ei_scalar_product_op<T>,
 };


+/***************************************************************************
+* Implementation of SparseMatrixBase and SparseCwise functions/operators
+***************************************************************************/
+
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_STRONG_INLINE const SparseCwiseBinaryOp<ei_scalar_difference_op<typename ei_traits<Derived>::Scalar>,
--- a/Eigen/src/Sparse/SparseCwiseUnaryOp.h
+++ b/Eigen/src/Sparse/SparseCwiseUnaryOp.h
@@ -89,7 +89,7 @@ class SparseCwiseUnaryOp<UnaryOp,MatrixType>::InnerIterator

  protected:
    MatrixTypeIterator m_iter;
-    const UnaryOp& m_functor;
+    const UnaryOp m_functor;
 };

 template<typename Derived>
--- a/Eigen/src/Sparse/SparseDiagonalProduct.h
+++ b/Eigen/src/Sparse/SparseDiagonalProduct.h
@@ -0,0 +1,157 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_SPARSE_DIAGONAL_PRODUCT_H
+#define EIGEN_SPARSE_DIAGONAL_PRODUCT_H
+
+// the product a diagonal matrix with a sparse matrix can be easily
+// implemented using expression template. We have two very different cases:
+// 1 - diag * row-major sparse
+//     => each inner vector <=> scalar * sparse vector product
+//     => so we can reuse CwiseUnaryOp::InnerIterator
+// 2 - diag * col-major sparse
+//     => each inner vector <=> densevector * sparse vector cwise product
+//     => again, we can reuse specialization of CwiseBinaryOp::InnerIterator
+//        for that particular case
+// The two other cases are symmetric.
+
+template<typename Lhs, typename Rhs>
+struct ei_traits<SparseDiagonalProduct<Lhs, Rhs> > : ei_traits<SparseProduct<Lhs, Rhs, DiagonalProduct> >
+{
+  typedef typename ei_cleantype<Lhs>::type _Lhs;
+  typedef typename ei_cleantype<Rhs>::type _Rhs;
+  enum {
+    SparseFlags = ei_is_diagonal<_Lhs>::ret ? int(_Rhs::Flags) : int(_Lhs::Flags),
+    Flags = SparseBit | (SparseFlags&RowMajorBit)
+  };
+};
+
+enum {SDP_IsDiagonal, SDP_IsSparseRowMajor, SDP_IsSparseColMajor};
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType, int RhsMode, int LhsMode> 
+class ei_sparse_diagonal_product_inner_iterator_selector;
+
+template<typename LhsNested, typename RhsNested>
+class SparseDiagonalProduct : public SparseMatrixBase<SparseDiagonalProduct<LhsNested,RhsNested> >, ei_no_assignment_operator
+{
+    typedef typename ei_traits<SparseDiagonalProduct>::_LhsNested _LhsNested;
+    typedef typename ei_traits<SparseDiagonalProduct>::_RhsNested _RhsNested;
+    
+    enum {
+      LhsMode = ei_is_diagonal<_LhsNested>::ret ? SDP_IsDiagonal
+              : (_LhsNested::Flags&RowMajorBit) ? SDP_IsSparseRowMajor : SDP_IsSparseColMajor,
+      RhsMode = ei_is_diagonal<_RhsNested>::ret ? SDP_IsDiagonal
+              : (_RhsNested::Flags&RowMajorBit) ? SDP_IsSparseRowMajor : SDP_IsSparseColMajor
+    };
+
+  public:
+
+    EIGEN_SPARSE_GENERIC_PUBLIC_INTERFACE(SparseDiagonalProduct)
+    
+    typedef ei_sparse_diagonal_product_inner_iterator_selector
+                <_LhsNested,_RhsNested,SparseDiagonalProduct,LhsMode,RhsMode> InnerIterator;
+
+    template<typename Lhs, typename Rhs>
+    EIGEN_STRONG_INLINE SparseDiagonalProduct(const Lhs& lhs, const Rhs& rhs)
+      : m_lhs(lhs), m_rhs(rhs)
+    {
+      ei_assert(lhs.cols() == rhs.rows() && "invalid sparse matrix * diagonal matrix product");
+    }
+
+    EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); }
+    EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); }
+
+    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
+    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
+
+  protected:
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
+class ei_sparse_diagonal_product_inner_iterator_selector
+<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseRowMajor>
+  : public SparseCwiseUnaryOp<ei_scalar_multiple_op<typename Lhs::Scalar>,Rhs>::InnerIterator
+{
+    typedef typename SparseCwiseUnaryOp<ei_scalar_multiple_op<typename Lhs::Scalar>,Rhs>::InnerIterator Base;
+  public:
+    inline ei_sparse_diagonal_product_inner_iterator_selector(
+              const SparseDiagonalProductType& expr, int outer)
+      : Base(expr.rhs()*(expr.lhs().diagonal().coeff(outer)), outer)
+    {}
+};
+
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
+class ei_sparse_diagonal_product_inner_iterator_selector
+<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseColMajor>
+  : public SparseCwiseBinaryOp<
+      ei_scalar_product_op<typename Lhs::Scalar>,
+      SparseInnerVectorSet<Rhs,1>,
+      typename Lhs::_CoeffsVectorType>::InnerIterator
+{
+    typedef typename SparseCwiseBinaryOp<
+      ei_scalar_product_op<typename Lhs::Scalar>,
+      SparseInnerVectorSet<Rhs,1>,
+      typename Lhs::_CoeffsVectorType>::InnerIterator Base;
+  public:
+    inline ei_sparse_diagonal_product_inner_iterator_selector(
+              const SparseDiagonalProductType& expr, int outer)
+      : Base(expr.rhs().innerVector(outer) .cwise()* expr.lhs().diagonal(), 0)
+    {}
+};
+
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
+class ei_sparse_diagonal_product_inner_iterator_selector
+<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseColMajor,SDP_IsDiagonal>
+  : public SparseCwiseUnaryOp<ei_scalar_multiple_op<typename Rhs::Scalar>,Lhs>::InnerIterator
+{
+    typedef typename SparseCwiseUnaryOp<ei_scalar_multiple_op<typename Rhs::Scalar>,Lhs>::InnerIterator Base;
+  public:
+    inline ei_sparse_diagonal_product_inner_iterator_selector(
+              const SparseDiagonalProductType& expr, int outer)
+      : Base(expr.lhs()*expr.rhs().diagonal().coeff(outer), outer)
+    {}
+};
+
+template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
+class ei_sparse_diagonal_product_inner_iterator_selector
+<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseRowMajor,SDP_IsDiagonal>
+  : public SparseCwiseBinaryOp<
+      ei_scalar_product_op<typename Rhs::Scalar>,
+      SparseInnerVectorSet<Lhs,1>,
+      NestByValue<Transpose<typename Rhs::_CoeffsVectorType> > >::InnerIterator
+{
+    typedef typename SparseCwiseBinaryOp<
+      ei_scalar_product_op<typename Rhs::Scalar>,
+      SparseInnerVectorSet<Lhs,1>,
+      NestByValue<Transpose<typename Rhs::_CoeffsVectorType> > >::InnerIterator Base;
+  public:
+    inline ei_sparse_diagonal_product_inner_iterator_selector(
+              const SparseDiagonalProductType& expr, int outer)
+      : Base(expr.lhs().innerVector(outer) .cwise()* expr.rhs().diagonal().transpose().nestByValue(), 0)
+    {}
+};
+
+#endif // EIGEN_SPARSE_DIAGONAL_PRODUCT_H
--- a/Eigen/src/Sparse/SparseLDLT.h
+++ b/Eigen/src/Sparse/SparseLDLT.h
@@ -79,7 +79,7 @@ class SparseLDLT
  protected:
    typedef typename MatrixType::Scalar Scalar;
    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef SparseMatrix<Scalar,LowerTriangular|UnitDiagBit> CholMatrixType;
+    typedef SparseMatrix<Scalar> CholMatrixType;
    typedef Matrix<Scalar,MatrixType::ColsAtCompileTime,1> VectorType;

    enum {
@@ -137,7 +137,7 @@ class SparseLDLT
      *                              overloads the MemoryEfficient flags)
      *
      * \sa flags() */
-    void settagss(int f) { m_flags = f; }
+    void settags(int f) { m_flags = f; }
    /** \returns the current flags */
    int flags() const { return m_flags; }

@@ -333,12 +333,12 @@ bool SparseLDLT<MatrixType, Backend>::solveInPlace(MatrixBase<Derived> &b) const
    return false;

  if (m_matrix.nonZeros()>0) // otherwise L==I
-    m_matrix.solveTriangularInPlace(b);
+    m_matrix.template triangular<LowerTriangular|UnitDiagBit>().solveInPlace(b);
  b = b.cwise() / m_diag;
  // FIXME should be .adjoint() but it fails to compile...

  if (m_matrix.nonZeros()>0) // otherwise L==I
-  m_matrix.transpose().solveTriangularInPlace(b);
+    m_matrix.transpose().template triangular<UpperTriangular|UnitDiagBit>().solveInPlace(b);

  return true;
 }
--- a/Eigen/src/Sparse/SparseLLT.h
+++ b/Eigen/src/Sparse/SparseLLT.h
@@ -135,7 +135,8 @@ void SparseLLT<MatrixType,Backend>::compute(const MatrixType& a)
  RealScalar density = a.nonZeros()/RealScalar(size*size);

  // TODO estimate the number of non zeros
-  m_matrix.startFill(a.nonZeros()*2);
+  m_matrix.setZero();
+  m_matrix.reserve(a.nonZeros()*2);
  for (int j = 0; j < size; ++j)
  {
    Scalar x = ei_real(a.coeff(j,j));
@@ -147,6 +148,8 @@ void SparseLLT<MatrixType,Backend>::compute(const MatrixType& a)
    // init with current matrix a
    {
      typename MatrixType::InnerIterator it(a,j);
+      ei_assert(it.index()==j &&
+        "matrix must has non zero diagonal entries and only the lower triangular part must be stored");
      ++it; // skip diagonal element
      for (; it; ++it)
        tempVector.coeffRef(it.index()) = it.value();
@@ -171,14 +174,15 @@ void SparseLLT<MatrixType,Backend>::compute(const MatrixType& a)
    // copy the temporary vector to the respective m_matrix.col()
    // while scaling the result by 1/real(x)
    RealScalar rx = ei_sqrt(ei_real(x));
-    m_matrix.fill(j,j) = rx;
+    m_matrix.insert(j,j) = rx; // FIXME use insertBack
    Scalar y = Scalar(1)/rx;
    for (typename AmbiVector<Scalar>::Iterator it(tempVector, m_precision*rx); it; ++it)
    {
-      m_matrix.fill(it.index(), j) = it.value() * y;
+      // FIXME use insertBack
+      m_matrix.insert(it.index(), j) = it.value() * y;
    }
  }
-  m_matrix.endFill();
+  m_matrix.finalize();
 }

 /** Computes b = L^-T L^-1 b */
@@ -189,15 +193,15 @@ bool SparseLLT<MatrixType, Backend>::solveInPlace(MatrixBase<Derived> &b) const
  const int size = m_matrix.rows();
  ei_assert(size==b.rows());

-  m_matrix.solveTriangularInPlace(b);
+  m_matrix.template triangular<LowerTriangular>().solveInPlace(b);
  // FIXME should be simply .adjoint() but it fails to compile...
  if (NumTraits<Scalar>::IsComplex)
  {
    CholMatrixType aux = m_matrix.conjugate();
-    aux.transpose().solveTriangularInPlace(b);
+    aux.transpose().template triangular<UpperTriangular>().solveInPlace(b);
  }
  else
-    m_matrix.transpose().solveTriangularInPlace(b);
+    m_matrix.transpose().template triangular<UpperTriangular>().solveInPlace(b);

  return true;
 }
--- a/Eigen/src/Sparse/SparseLU.h
+++ b/Eigen/src/Sparse/SparseLU.h
@@ -25,6 +25,12 @@
 #ifndef EIGEN_SPARSELU_H
 #define EIGEN_SPARSELU_H

+enum {
+    SvNoTrans   = 0,
+    SvTranspose = 1,
+    SvAdjoint   = 2
+};
+
 /** \ingroup Sparse_Module
  *
  * \class SparseLU
@@ -115,7 +121,8 @@ class SparseLU
    //inline const MatrixType& matrixU() const { return m_matrixU; }

    template<typename BDerived, typename XDerived>
-    bool solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x) const;
+    bool solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x,
+               const int transposed = SvNoTrans) const;

    /** \returns true if the factorization succeeded */
    inline bool succeeded(void) const { return m_succeeded; }
@@ -136,10 +143,17 @@ void SparseLU<MatrixType,Backend>::compute(const MatrixType& a)
  ei_assert(false && "not implemented yet");
 }

-/** Computes *x = U^-1 L^-1 b */
+/** Computes *x = U^-1 L^-1 b
+  *
+  * If \a transpose is set to SvTranspose or SvAdjoint, the solution
+  * of the transposed/adjoint system is computed instead.
+  *
+  * Not all backends implement the solution of the transposed or
+  * adjoint system.
+  */
 template<typename MatrixType, int Backend>
 template<typename BDerived, typename XDerived>
-bool SparseLU<MatrixType,Backend>::solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x) const
+bool SparseLU<MatrixType,Backend>::solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x, const int transposed) const
 {
  ei_assert(false && "not implemented yet");
  return false;
--- a/Eigen/src/Sparse/SparseMatrix.h
+++ b/Eigen/src/Sparse/SparseMatrix.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -62,7 +62,7 @@ class SparseMatrix
    // FIXME: why are these operator already alvailable ???
    // EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(SparseMatrix, *=)
    // EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(SparseMatrix, /=)
-    
+
    typedef MappedSparseMatrix<Scalar,Flags> Map;

  protected:
@@ -79,7 +79,7 @@ class SparseMatrix

    inline int rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
    inline int cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
-    
+
    inline int innerSize() const { return m_innerSize; }
    inline int outerSize() const { return m_outerSize; }
    inline int innerNonZeros(int j) const { return m_outerIndex[j+1]-m_outerIndex[j]; }
@@ -118,34 +118,36 @@ class SparseMatrix

    class InnerIterator;

+    /** Removes all non zeros */
    inline void setZero()
    {
      m_data.clear();
-      //if (m_outerSize)
      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(int));
-//       for (int i=0; i<m_outerSize; ++i)
-//         m_outerIndex[i] = 0;
-//       if (m_outerSize)
-//         m_outerIndex[i] = 0;
    }

    /** \returns the number of non zero coefficients */
    inline int nonZeros() const  { return m_data.size(); }

-    /** Initializes the filling process of \c *this.
+    /** \deprecated use setZero() and reserve()
+      * Initializes the filling process of \c *this.
      * \param reserveSize approximate number of nonzeros
      * Note that the matrix \c *this is zero-ed.
      */
-    inline void startFill(int reserveSize = 1000)
+    EIGEN_DEPRECATED void startFill(int reserveSize = 1000)
    {
-//       std::cerr << this << " startFill\n";
      setZero();
      m_data.reserve(reserveSize);
    }

-    /**
+    /** Preallocates \a reserveSize non zeros */
+    inline void reserve(int reserveSize)
+    {
+      m_data.reserve(reserveSize);
+    }
+
+    /** \deprecated use insert()
      */
-    inline Scalar& fill(int row, int col)
+    EIGEN_DEPRECATED Scalar& fill(int row, int col)
    {
      const int outer = IsRowMajor ? row : col;
      const int inner = IsRowMajor ? col : row;
@@ -161,6 +163,11 @@ class SparseMatrix
        }
        m_outerIndex[outer+1] = m_outerIndex[outer];
      }
+      else
+      {
+        ei_assert(m_data.index(m_data.size()-1)<inner && "wrong sorted insertion");
+      }
+//       std::cerr << size_t(m_outerIndex[outer+1]) << " == " << m_data.size() << "\n";
      assert(size_t(m_outerIndex[outer+1]) == m_data.size());
      int id = m_outerIndex[outer+1];
      ++m_outerIndex[outer+1];
@@ -169,44 +176,135 @@ class SparseMatrix
      return m_data.value(id);
    }

-    /** Like fill() but with random inner coordinates.
+    //--- low level purely coherent filling ---
+
+    inline Scalar& insertBack(int outer, int inner)
+    {
+      ei_assert(size_t(m_outerIndex[outer+1]) == m_data.size() && "wrong sorted insertion");
+      ei_assert( (m_outerIndex[outer+1]-m_outerIndex[outer]==0 || m_data.index(m_data.size()-1)<inner) && "wrong sorted insertion");
+      int id = m_outerIndex[outer+1];
+      ++m_outerIndex[outer+1];
+      m_data.append(0, inner);
+      return m_data.value(id);
+    }
+
+    inline void startVec(int outer)
+    {
+      ei_assert(m_outerIndex[outer]==int(m_data.size()) && "you must call startVec on each inner vec");
+      ei_assert(m_outerIndex[outer+1]==0 && "you must call startVec on each inner vec");
+      m_outerIndex[outer+1] = m_outerIndex[outer];
+    }
+
+    //---
+
+    /** \deprecated use insert()
+      * Like fill() but with random inner coordinates.
      */
-    inline Scalar& fillrand(int row, int col)
+    EIGEN_DEPRECATED Scalar& fillrand(int row, int col)
+    {
+      return insert(row,col);
+    }
+
+    /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
+      * The non zero coefficient must \b not already exist.
+      *
+      * \warning This function can be extremely slow if the non zero coefficients
+      * are not inserted in a coherent order.
+      *
+      * After an insertion session, you should call the finalize() function.
+      */
+    EIGEN_DONT_INLINE Scalar& insert(int row, int col)
    {
      const int outer = IsRowMajor ? row : col;
      const int inner = IsRowMajor ? col : row;
+
+      int previousOuter = outer;
      if (m_outerIndex[outer+1]==0)
      {
        // we start a new inner vector
-        // nothing special to do here
-        int i = outer;
-        while (i>=0 && m_outerIndex[i]==0)
+        while (previousOuter>=0 && m_outerIndex[previousOuter]==0)
        {
-          m_outerIndex[i] = m_data.size();
-          --i;
+          m_outerIndex[previousOuter] = m_data.size();
+          --previousOuter;
        }
        m_outerIndex[outer+1] = m_outerIndex[outer];
      }
-      assert(size_t(m_outerIndex[outer+1]) == m_data.size() && "invalid outer index");
+
+      // here we have to handle the tricky case where the outerIndex array
+      // starts with: [ 0 0 0 0 0 1 ...] and we are inserting in, e.g.,
+      // the 2nd inner vector...
+      bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0))
+                    && (size_t(m_outerIndex[outer+1]) == m_data.size());
+
      size_t startId = m_outerIndex[outer];
      // FIXME let's make sure sizeof(long int) == sizeof(size_t)
      size_t id = m_outerIndex[outer+1];
      ++m_outerIndex[outer+1];
-      
+
      float reallocRatio = 1;
-      if (m_data.allocatedSize()<id+1)
+      if (m_data.allocatedSize()<=m_data.size())
      {
-        // we need to reallocate the data, to reduce multiple reallocations
-        // we use a smart resize algorithm based on the current filling ratio
-        // we use float to avoid overflows
-        float nnzEstimate = float(m_outerIndex[outer])*float(m_outerSize)/float(outer);
-        reallocRatio = (nnzEstimate-float(m_data.size()))/float(m_data.size());
-        // let's bounds the realloc ratio to
-        //   1) reduce multiple minor realloc when the matrix is almost filled
-        //   2) avoid to allocate too much memory when the matrix is almost empty
-        reallocRatio = std::min(std::max(reallocRatio,1.5f),8.f);
+        // if there is no preallocated memory, let's reserve a minimum of 32 elements
+        if (m_data.size()==0)
+        {
+          m_data.reserve(32);
+        }
+        else
+        {
+          // we need to reallocate the data, to reduce multiple reallocations
+          // we use a smart resize algorithm based on the current filling ratio
+          // in addition, we use float to avoid integers overflows
+          float nnzEstimate = float(m_outerIndex[outer])*float(m_outerSize)/float(outer+1);
+          reallocRatio = (nnzEstimate-float(m_data.size()))/float(m_data.size());
+          // furthermore we bound the realloc ratio to:
+          //   1) reduce multiple minor realloc when the matrix is almost filled
+          //   2) avoid to allocate too much memory when the matrix is almost empty
+          reallocRatio = std::min(std::max(reallocRatio,1.5f),8.f);
+        }
+      }
+      m_data.resize(m_data.size()+1,reallocRatio);
+
+      if (!isLastVec)
+      {
+        if (previousOuter==-1)
+        {
+          // oops wrong guess.
+          // let's correct the outer offsets
+          for (int k=0; k<=(outer+1); ++k)
+            m_outerIndex[k] = 0;
+          int k=outer+1;
+          while(m_outerIndex[k]==0)
+            m_outerIndex[k++] = 1;
+          while (k<=m_outerSize && m_outerIndex[k]!=0)
+            m_outerIndex[k++]++;
+          id = 0;
+          --k;
+          k = m_outerIndex[k]-1;
+          while (k>0)
+          {
+            m_data.index(k) = m_data.index(k-1);
+            m_data.value(k) = m_data.value(k-1);
+            k--;
+          }
+        }
+        else
+        {
+          // we are not inserting into the last inner vec
+          // update outer indices:
+          int j = outer+2;
+          while (j<=m_outerSize && m_outerIndex[j]!=0)
+            m_outerIndex[j++]++;
+          --j;
+          // shift data of last vecs:
+          int k = m_outerIndex[j]-1;
+          while (k>=int(id))
+          {
+            m_data.index(k) = m_data.index(k-1);
+            m_data.value(k) = m_data.value(k-1);
+            k--;
+          }
+        }
      }
-      m_data.resize(id+1,reallocRatio);

      while ( (id > startId) && (m_data.index(id-1) > inner) )
      {
@@ -214,12 +312,16 @@ class SparseMatrix
        m_data.value(id) = m_data.value(id-1);
        --id;
      }
-      
+
      m_data.index(id) = inner;
      return (m_data.value(id) = 0);
    }

-    inline void endFill()
+    EIGEN_DEPRECATED void endFill() { finalize(); }
+
+    /** Must be called after inserting a set of non zero entries.
+      */
+    inline void finalize()
    {
      int size = m_data.size();
      int i = m_outerSize;
@@ -233,7 +335,7 @@ class SparseMatrix
        ++i;
      }
    }
-    
+
    void prune(Scalar reference, RealScalar epsilon = precision<RealScalar>())
    {
      int k = 0;
@@ -390,11 +492,11 @@ class SparseMatrix
        s << std::endl;
        s << std::endl;
        s << "Column pointers:\n";
-        for (int i=0; i<m.cols(); ++i)
+        for (int i=0; i<m.outerSize(); ++i)
        {
          s << m.m_outerIndex[i] << " ";
        }
-        s << std::endl;
+        s << " $" << std::endl;
        s << std::endl;
      );
      s << static_cast<const SparseMatrixBase<SparseMatrix>&>(m);
--- a/Show More
+++ b/Show More