From 4fa40367e9bf55ea8b2ad1040b3fb73f94e4f12f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 9 Aug 2008 18:41:24 +0000
Subject: [PATCH] * Big change in Block and Map:   - added a MapBase base xpr
 on top of which Map and the specialization     of Block are implemented   -
 MapBase forces both aligned loads (and aligned stores, see below) in
 expressions     such as "x.block(...) += other_expr" * Significant
 vectorization improvement:  - added a AlignedBit flag meaning the first
 coeff/packet is aligned,    this allows to not generate extra code to deal
 with the first unaligned part  - removed all unaligned stores when no
 unrolling  - removed unaligned loads in Sum when the input as the
 DirectAccessBit flag * Some code simplification in CacheFriendly product *
 Some minor documentation improvements

---
 Eigen/Core                                |   3 +-
 Eigen/src/Core/Assign.h                   |  69 ++++++---
 Eigen/src/Core/Block.h                    | 145 ++++++-------------
 Eigen/src/Core/CacheFriendlyProduct.h     |  42 ++----
 Eigen/src/Core/Coeffs.h                   |  12 +-
 Eigen/src/Core/CwiseBinaryOp.h            |   2 +-
 Eigen/src/Core/CwiseUnaryOp.h             |   2 +-
 Eigen/src/Core/Dot.h                      |  16 ++-
 Eigen/src/Core/Map.h                      | 117 +++------------
 Eigen/src/Core/MapBase.h                  | 167 ++++++++++++++++++++++
 Eigen/src/Core/MatrixBase.h               |   4 +-
 Eigen/src/Core/Sum.h                      |  36 +++--
 Eigen/src/Core/Swap.h                     |  30 ++--
 Eigen/src/Core/util/Constants.h           |  27 ++--
 Eigen/src/Core/util/ForwardDeclarations.h |   6 +-
 Eigen/src/Core/util/Meta.h                |   6 +-
 Eigen/src/Geometry/Quaternion.h           |   9 +-
 17 files changed, 397 insertions(+), 296 deletions(-)
 create mode 100644 Eigen/src/Core/MapBase.h
diff --git a/Eigen/Core b/Eigen/Core
index e671f7879..db2fde30d 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -45,6 +45,8 @@ namespace Eigen {
 #include "src/Core/Product.h"
 #include "src/Core/DiagonalProduct.h"
 #include "src/Core/InverseProduct.h"
+#include "src/Core/MapBase.h"
+#include "src/Core/Map.h"
 #include "src/Core/Block.h"
 #include "src/Core/Minor.h"
 #include "src/Core/Transpose.h"
@@ -54,7 +56,6 @@ namespace Eigen {
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/Map.h"
 #include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index d744a15a4..5ea59e3db 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -34,6 +34,13 @@
 template <typename Derived, typename OtherDerived>
 struct ei_assign_traits
 {
+public:
+  enum {
+    DstIsAligned = Derived::Flags & AlignedBit,
+    SrcIsAligned = OtherDerived::Flags & AlignedBit,
+    SrcAlignment = DstIsAligned && SrcIsAligned ? Aligned : Unaligned
+  };
+
 private:
   enum {
     InnerSize = int(Derived::Flags)&RowMajorBit
@@ -48,7 +55,8 @@ private:
   enum {
     MightVectorize = (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit)
                   && ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
-    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0,
+    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+                       && int(DstIsAligned) && int(SrcIsAligned),
     MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
     MaySliceVectorize  = MightVectorize && int(InnerMaxSize)==Dynamic /* slice vectorization can be slow, so we only
       want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
@@ -79,7 +87,7 @@ public:
                                             : int(NoUnrolling)
                 )
               : int(Vectorization) == int(LinearVectorization)
-              ? ( int(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
+              ? ( int(MayUnrollCompletely) && int(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
               : int(NoUnrolling)
   };
 };
@@ -154,7 +162,7 @@ struct ei_assign_innervec_CompleteUnrolling
 
   inline static void run(Derived1 &dst, const Derived2 &src)
   {
-    dst.template copyPacket<Derived2, Aligned>(row, col, src);
+    dst.template copyPacket<Derived2, Aligned, Aligned>(row, col, src);
     ei_assign_innervec_CompleteUnrolling<Derived1, Derived2,
       Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
   }
@@ -173,7 +181,7 @@ struct ei_assign_innervec_InnerUnrolling
   {
     const int row = int(Derived1::Flags)&RowMajorBit ? row_or_col : Index;
     const int col = int(Derived1::Flags)&RowMajorBit ? Index : row_or_col;
-    dst.template copyPacket<Derived2, Aligned>(row, col, src);
+    dst.template copyPacket<Derived2, Aligned, Aligned>(row, col, src);
     ei_assign_innervec_InnerUnrolling<Derived1, Derived2,
       Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, row_or_col);
   }
@@ -256,9 +264,9 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling>
       for(int i = 0; i < innerSize; i+=packetSize)
       {
         if(int(Derived1::Flags)&RowMajorBit)
-          dst.template copyPacket<Derived2, Aligned>(j, i, src);
+          dst.template copyPacket<Derived2, Aligned, Aligned>(j, i, src);
         else
-          dst.template copyPacket<Derived2, Aligned>(i, j, src);
+          dst.template copyPacket<Derived2, Aligned, Aligned>(i, j, src);
       }
   }
 };
@@ -298,14 +306,19 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
   {
     const int size = dst.size();
     const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
-    const int alignedSize = (size/packetSize)*packetSize;
+    const int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+                           : ei_alignmentOffset(&dst.coeffRef(0), size);
+    const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
 
-    for(int index = 0; index < alignedSize; index += packetSize)
+    for(int index = 0; index < alignedStart; index++)
+      dst.copyCoeff(index, src);
+
+    for(int index = alignedStart; index < alignedEnd; index += packetSize)
     {
-      dst.template copyPacket<Derived2, Aligned>(index, src);
+      dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::SrcAlignment>(index, src);
     }
 
-    for(int index = alignedSize; index < size; index++)
+    for(int index = alignedEnd; index < size; index++)
       dst.copyCoeff(index, src);
   }
 };
@@ -334,29 +347,45 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
   static void run(Derived1 &dst, const Derived2 &src)
   {
     const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+    const int packetAlignedMask = packetSize - 1;
     const int innerSize = dst.innerSize();
     const int outerSize = dst.outerSize();
-    const int alignedInnerSize = (innerSize/packetSize)*packetSize;
+    const int alignedStep = (packetSize - dst.stride() % packetSize) & packetAlignedMask;
+    int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+                     : ei_alignmentOffset(&dst.coeffRef(0), innerSize);
 
     for(int i = 0; i < outerSize; i++)
     {
-      // do the vectorizable part of the assignment
-      for (int index = 0; index<alignedInnerSize ; index+=packetSize)
-      {
-        if(Derived1::Flags&RowMajorBit)
-          dst.template copyPacket<Derived2, Unaligned>(i, index, src);
-        else
-          dst.template copyPacket<Derived2, Unaligned>(index, i, src);
-      }
+      const int alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
 
       // do the non-vectorizable part of the assignment
-      for (int index = alignedInnerSize; index<innerSize ; index++)
+      for (int index = 0; index<alignedStart ; index++)
       {
         if(Derived1::Flags&RowMajorBit)
           dst.copyCoeff(i, index, src);
         else
           dst.copyCoeff(index, i, src);
       }
+
+      // do the vectorizable part of the assignment
+      for (int index = alignedStart; index<alignedEnd; index+=packetSize)
+      {
+        if(Derived1::Flags&RowMajorBit)
+          dst.template copyPacket<Derived2, Aligned, Unaligned>(i, index, src);
+        else
+          dst.template copyPacket<Derived2, Aligned, Unaligned>(index, i, src);
+      }
+
+      // do the non-vectorizable part of the assignment
+      for (int index = alignedEnd; index<innerSize ; index++)
+      {
+        if(Derived1::Flags&RowMajorBit)
+          dst.copyCoeff(i, index, src);
+        else
+          dst.copyCoeff(index, i, src);
+      }
+
+      alignedStart = (alignedStart+alignedStep)%packetSize;
     }
   }
 };
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index c3174073b..c2c9606a5 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -33,6 +33,8 @@
   * \param MatrixType the type of the object in which we are taking a block
   * \param BlockRows the number of rows of the block we are taking at compile time (optional)
   * \param BlockCols the number of columns of the block we are taking at compile time (optional)
+  * \param _PacketAccess 
+  * \param _DirectAccessStatus \internal used for partial specialization
   *
   * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
   * type of MatrixBase::block(int,int,int,int) and MatrixBase::block<int,int>(int,int) and
@@ -56,8 +58,8 @@
   *
   * \sa MatrixBase::block(int,int,int,int), MatrixBase::block(int,int), class VectorBlock
   */
-template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus>
-struct ei_traits<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> >
+template<typename MatrixType, int BlockRows, int BlockCols, int _PacketAccess, int _DirectAccessStatus>
+struct ei_traits<Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectAccessStatus> >
 {
   typedef typename MatrixType::Scalar Scalar;
   enum{
@@ -74,17 +76,21 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> >
     RowMajor = int(MatrixType::Flags)&RowMajorBit,
     InnerSize = RowMajor ? ColsAtCompileTime : RowsAtCompileTime,
     InnerMaxSize = RowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime,
-    MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize % ei_packet_traits<Scalar>::size) == 0)
+    MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize >= ei_packet_traits<Scalar>::size))
                         ? PacketAccessBit : 0,
     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
     Flags = (MatrixType::Flags & (HereditaryBits | MaskPacketAccessBit | DirectAccessBit) & MaskLargeBit)
           | FlagsLinearAccessBit,
-    CoeffReadCost = MatrixType::CoeffReadCost
+    CoeffReadCost = MatrixType::CoeffReadCost,
+    PacketAccess = _PacketAccess
   };
+  typedef typename ei_meta_if<int(PacketAccess)==Aligned,
+                 Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectAccessStatus>&,
+                 Block<MatrixType, BlockRows, BlockCols, Aligned, _DirectAccessStatus> >::ret AlignedDerivedType;
 };
 
-template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus> class Block
-  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> >
+template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess, int _DirectAccessStatus> class Block
+  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols, PacketAccess, _DirectAccessStatus> >
 {
   public:
 
@@ -205,26 +211,36 @@ template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatu
 };
 
 /** \internal */
-template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixType,BlockRows,BlockCols,HasDirectAccess>
-  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols,HasDirectAccess> >
+template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess>
+class Block<MatrixType,BlockRows,BlockCols,PacketAccess,HasDirectAccess>
+  : public MapBase<Block<MatrixType, BlockRows, BlockCols,PacketAccess,HasDirectAccess> >
 {
-    enum {
-      IsRowMajor = int(ei_traits<MatrixType>::Flags)&RowMajorBit ? 1 : 0
-    };
-
   public:
 
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
+    _EIGEN_GENERIC_PUBLIC_INTERFACE(Block, MapBase<Block>)
+
+    typedef typename ei_traits<Block>::AlignedDerivedType AlignedDerivedType;
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
+
+    AlignedDerivedType allowAligned()
+    {
+      if (PacketAccess==Aligned)
+        return *this;
+      else
+        return Block<MatrixType,BlockRows,BlockCols,Aligned,HasDirectAccess>
+                    (m_matrix, Base::m_data, Base::m_rows.value(), Base::m_cols.value());
+    }
 
     /** Column or Row constructor
       */
     inline Block(const MatrixType& matrix, int i)
-      : m_matrix(matrix),
-        m_data_ptr(&matrix.const_cast_derived().coeffRef(
-          (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0,
-          (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
-        m_blockRows(matrix.rows()),
-        m_blockCols(matrix.cols())
+      : Base(&matrix.const_cast_derived().coeffRef(
+              (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0,
+              (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
+             BlockRows==1 ? 1 : matrix.rows(),
+             BlockCols==1 ? 1 : matrix.cols()),
+        m_matrix(matrix)
     {
       ei_assert( (i>=0) && (
           ((BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) && i<matrix.rows())
@@ -234,13 +250,10 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixTy
     /** Fixed-size constructor
       */
     inline Block(const MatrixType& matrix, int startRow, int startCol)
-      : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)),
-        m_blockRows(matrix.rows()), m_blockCols(matrix.cols())
+      : Base(&matrix.const_cast_derived().coeffRef(startRow,startCol)), m_matrix(matrix)
     {
-      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && RowsAtCompileTime!=Dynamic,this_method_is_only_for_fixed_size);
-      ei_assert(RowsAtCompileTime!=Dynamic && RowsAtCompileTime!=Dynamic);
       ei_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= matrix.rows()
-          && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= matrix.cols());
+             && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= matrix.cols());
     }
 
     /** Dynamic-size constructor
@@ -248,91 +261,25 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixTy
     inline Block(const MatrixType& matrix,
           int startRow, int startCol,
           int blockRows, int blockCols)
-      : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)),
-        m_blockRows(blockRows), m_blockCols(blockCols)
+      : Base(&matrix.const_cast_derived().coeffRef(startRow,startCol), blockRows, blockCols),
+        m_matrix(matrix)
     {
       ei_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
-          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
+             && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
       ei_assert(startRow >= 0 && blockRows >= 1 && startRow + blockRows <= matrix.rows()
-          && startCol >= 0 && blockCols >= 1 && startCol + blockCols <= matrix.cols());
+             && startCol >= 0 && blockCols >= 1 && startCol + blockCols <= matrix.cols());
     }
 
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
-
-    inline int rows() const { return m_blockRows.value(); }
-    inline int cols() const { return m_blockCols.value(); }
-
     inline int stride(void) const { return m_matrix.stride(); }
 
-    inline Scalar& coeffRef(int row, int col)
-    {
-      if (IsRowMajor)
-        return m_data_ptr[col + row * stride()];
-      else
-        return m_data_ptr[row + col * stride()];
-    }
-
-    inline const Scalar coeff(int row, int col) const
-    {
-      if (IsRowMajor)
-        return m_data_ptr[col + row * stride()];
-      else
-        return m_data_ptr[row + col * stride()];
-    }
-
-    inline Scalar& coeffRef(int index)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
-      return m_data_ptr[index];
-    }
-
-    inline const Scalar coeff(int index) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
-      if ( (RowsAtCompileTime == 1) == IsRowMajor )
-        return m_data_ptr[index];
-      else
-        return m_data_ptr[index*stride()];
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(int row, int col) const
-    {
-      if (IsRowMajor)
-        return ei_ploadu(&m_data_ptr[col + row * stride()]);
-      else
-        return ei_ploadu(&m_data_ptr[row + col * stride()]);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(int row, int col, const PacketScalar& x)
-    {
-      if (IsRowMajor)
-        ei_pstoreu(&m_data_ptr[col + row * stride()], x);
-      else
-        ei_pstoreu(&m_data_ptr[row + col * stride()], x);
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(int index) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
-      return ei_ploadu(&m_data_ptr[index]);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(int index, const PacketScalar& x)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
-      ei_pstoreu(&m_data_ptr[index], x);
-    }
-
   protected:
 
+    /** \internal used by allowAligned() */
+    inline Block(const MatrixType& matrix, const Scalar* data, int blockRows, int blockCols)
+      : Base(data, blockRows, blockCols), m_matrix(matrix)
+    {}
+
     const typename MatrixType::Nested m_matrix;
-    Scalar* m_data_ptr;
-    const ei_int_if_dynamic<RowsAtCompileTime> m_blockRows;
-    const ei_int_if_dynamic<ColsAtCompileTime> m_blockCols;
 };
 
 /** \returns a dynamic-size expression of a block in *this.
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index 9d4b0af36..bd9f4d0d9 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -419,16 +419,19 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
 
     ei_internal_assert((alignmentPattern==NoneAligned) || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
   }
+
+  int offset1 = (FirstAligned && alignmentStep==1?3:1);
+  int offset3 = (FirstAligned && alignmentStep==1?1:3);
   
   int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
   {
-    Packet ptmp0 = ei_pset1(rhs[i]),   ptmp1 = ei_pset1(rhs[i+1]),
-           ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+3]);
+    Packet ptmp0 = ei_pset1(rhs[i]),   ptmp1 = ei_pset1(rhs[i+offset1]),
+           ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+offset3]);
 
     // this helps a lot generating better binary code
-    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+1)*lhsStride,
-                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
+    const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
+                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
 
     if (PacketSize>1)
     {
@@ -453,17 +456,11 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
             if(peels>1)
             {
               Packet A00, A01, A02, A03, A10, A11, A12, A13;
-              if (alignmentStep==1)
-              {
-                A00 = ptmp1; ptmp1 = ptmp3; ptmp3 = A00;
-                const Scalar* aux = lhs1;
-                lhs1 = lhs3; lhs3 = aux;
-              }
 
               A01 = ei_pload(&lhs1[alignedStart-1]);
               A02 = ei_pload(&lhs2[alignedStart-2]);
               A03 = ei_pload(&lhs3[alignedStart-3]);
-                
+
               for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
               {
                 A11 = ei_pload(&lhs1[j-1+PacketSize]);  ei_palign<1>(A01,A11);
@@ -613,6 +610,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
       || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
   }
+
+  int offset1 = (FirstAligned && alignmentStep==1?3:1);
+  int offset3 = (FirstAligned && alignmentStep==1?1:3);
   
   int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
@@ -620,8 +620,8 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
 
     // this helps the compiler generating good binary code
-    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+1)*lhsStride,
-                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
+    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
+                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
 
     if (PacketSize>1)
     {
@@ -658,13 +658,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
                * than basic unaligned loads.
                */
               Packet A01, A02, A03, b, A11, A12, A13;
-              if (alignmentStep==1)
-              {
-                // flip row #1 and #3
-                b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
-                const Scalar* aux = lhs1;
-                lhs1 = lhs3; lhs3 = aux;
-              }
               A01 = ei_pload(&lhs1[alignedStart-1]);
               A02 = ei_pload(&lhs2[alignedStart-2]);
               A03 = ei_pload(&lhs3[alignedStart-3]);
@@ -690,13 +683,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
                 ptmp2 = ei_pmadd(b, A12, ptmp2);
                 ptmp3 = ei_pmadd(b, A13, ptmp3);
               }
-              if (alignmentStep==1)
-              {
-                // restore rows #1 and #3
-                b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
-                const Scalar* aux = lhs1;
-                lhs1 = lhs3; lhs3 = aux;
-              }
             }
             for (int j = peeledSize; j<alignedSize; j+=PacketSize)
               _EIGEN_ACCUMULATE_PACKETS(,u,u,);
@@ -720,7 +706,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
       Scalar b = rhs[j];
       tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
     }
-    res[i] += tmp0; res[i+1] += tmp1; res[i+2] += tmp2; res[i+3] += tmp3;
+    res[i] += tmp0; res[i+offset1] += tmp1; res[i+2] += tmp2; res[i+offset3] += tmp3;
   }
 
   // process remaining first and last rows (at most columnsAtOnce-1)
diff --git a/Eigen/src/Core/Coeffs.h b/Eigen/src/Core/Coeffs.h
index e21083664..8836998be 100644
--- a/Eigen/src/Core/Coeffs.h
+++ b/Eigen/src/Core/Coeffs.h
@@ -298,22 +298,22 @@ inline void MatrixBase<Derived>::copyCoeff(int index, const MatrixBase<OtherDeri
 }
 
 template<typename Derived>
-template<typename OtherDerived, int LoadStoreMode>
+template<typename OtherDerived, int StoreMode, int LoadMode>
 inline void MatrixBase<Derived>::copyPacket(int row, int col, const MatrixBase<OtherDerived>& other)
 {
   ei_internal_assert(row >= 0 && row < rows()
                      && col >= 0 && col < cols());
-  derived().template writePacket<LoadStoreMode>(row, col,
-    other.derived().template packet<LoadStoreMode>(row, col));
+  derived().template writePacket<StoreMode>(row, col,
+    other.derived().template packet<LoadMode>(row, col));
 }
 
 template<typename Derived>
-template<typename OtherDerived, int LoadStoreMode>
+template<typename OtherDerived, int StoreMode, int LoadMode>
 inline void MatrixBase<Derived>::copyPacket(int index, const MatrixBase<OtherDerived>& other)
 {
   ei_internal_assert(index >= 0 && index < size());
-  derived().template writePacket<LoadStoreMode>(index,
-    other.derived().template packet<LoadStoreMode>(index));
+  derived().template writePacket<StoreMode>(index,
+    other.derived().template packet<LoadMode>(index));
 }
 
 #endif // EIGEN_COEFFS_H
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index 51c1f9e43..dcf2c9063 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -67,7 +67,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
     MaxColsAtCompileTime = Lhs::MaxColsAtCompileTime,
     Flags = (int(LhsFlags) | int(RhsFlags)) & (
         HereditaryBits
-      | (int(LhsFlags) & int(RhsFlags) & LinearAccessBit)
+      | (int(LhsFlags) & int(RhsFlags) & (LinearAccessBit | AlignedBit))
       | (ei_functor_traits<BinaryOp>::PacketAccess && ((int(LhsFlags) & RowMajorBit)==(int(RhsFlags) & RowMajorBit))
         ? (int(LhsFlags) & int(RhsFlags) & PacketAccessBit) : 0)),
     CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + ei_functor_traits<BinaryOp>::Cost
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index bb354958e..e9aeb6608 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -55,7 +55,7 @@ struct ei_traits<CwiseUnaryOp<UnaryOp, MatrixType> >
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     Flags = (MatrixTypeFlags & (
-      HereditaryBits | LinearAccessBit
+      HereditaryBits | LinearAccessBit | AlignedBit
       | (ei_functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0))),
     CoeffReadCost = MatrixTypeCoeffReadCost + ei_functor_traits<UnaryOp>::Cost
   };
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index a5d2f0ba3..9bdff50b3 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -123,7 +123,9 @@ struct ei_dot_vec_unroller<Derived1, Derived2, Index, Stop, true>
     row1 = Derived1::RowsAtCompileTime == 1 ? 0 : Index,
     col1 = Derived1::RowsAtCompileTime == 1 ? Index : 0,
     row2 = Derived2::RowsAtCompileTime == 1 ? 0 : Index,
-    col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0
+    col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0,
+    alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned,
+    alignment2 = (Derived2::Flags & AlignedBit) ? Aligned : Unaligned
   };
 
   typedef typename Derived1::Scalar Scalar;
@@ -131,7 +133,7 @@ struct ei_dot_vec_unroller<Derived1, Derived2, Index, Stop, true>
 
   inline static PacketScalar run(const Derived1& v1, const Derived2& v2)
   {
-    return ei_pmul(v1.template packet<Aligned>(row1, col1), v2.template packet<Aligned>(row2, col2));
+    return ei_pmul(v1.template packet<alignment1>(row1, col1), v2.template packet<alignment2>(row2, col2));
   }
 };
 
@@ -175,20 +177,22 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
     const int size = v1.size();
     const int packetSize = ei_packet_traits<Scalar>::size;
     const int alignedSize = (size/packetSize)*packetSize;
+    const int alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned;
+    const int alignment2 = (Derived2::Flags & AlignedBit) ? Aligned : Unaligned;
     Scalar res;
 
     // do the vectorizable part of the sum
     if(size >= packetSize)
     {
       PacketScalar packet_res = ei_pmul(
-                                  v1.template packet<Aligned>(0),
-                                  v2.template packet<Aligned>(0)
+                                  v1.template packet<alignment1>(0),
+                                  v2.template packet<alignment2>(0)
                                 );
       for(int index = packetSize; index<alignedSize; index += packetSize)
       {
         packet_res = ei_pmadd(
-                       v1.template packet<Aligned>(index),
-                       v2.template packet<Aligned>(index),
+                       v1.template packet<alignment1>(index),
+                       v2.template packet<alignment2>(index),
                        packet_res
                      );
       }
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index f8d924f7d..a1953993f 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -2,6 +2,7 @@
 // for linear algebra. Eigen itself is part of the KDE project.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -29,8 +30,8 @@
   *
   * \brief A matrix or vector expression mapping an existing array of data.
   *
-  * \param Alignment can be either Aligned or Unaligned. Tells whether the array is suitably aligned for
-  *                  vectorization on the present CPU architecture. Defaults to Unaligned.
+  * \param _PacketAccess controls whether vectorized aligned loads or stores are allowed (Aligned)
+  *                      or forced to unaligned (Unaligned). Defaults to Unaligned.
   *
   * This class represents a matrix or vector expression mapping an existing array of data.
   * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
@@ -40,117 +41,43 @@
   *
   * \sa Matrix::map()
   */
-template<typename MatrixType, int Alignment>
-struct ei_traits<Map<MatrixType, Alignment> >
+template<typename MatrixType, int _PacketAccess>
+struct ei_traits<Map<MatrixType, _PacketAccess> > : public ei_traits<MatrixType>
 {
-  typedef typename MatrixType::Scalar Scalar;
   enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost
+    PacketAccess = _PacketAccess,
+    Flags = ei_traits<MatrixType>::Flags & ~AlignedBit
   };
+  typedef typename ei_meta_if<int(PacketAccess)==Aligned,
+                              Map<MatrixType, _PacketAccess>&,
+                              Map<MatrixType, Aligned> >::ret AlignedDerivedType;
 };
 
-template<typename MatrixType, int Alignment> class Map
-  : public MatrixBase<Map<MatrixType, Alignment> >
+template<typename MatrixType, int PacketAccess> class Map
+  : public MapBase<Map<MatrixType, PacketAccess> >
 {
   public:
 
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Map)
-
-    inline int rows() const { return m_rows.value(); }
-    inline int cols() const { return m_cols.value(); }
+    _EIGEN_GENERIC_PUBLIC_INTERFACE(Map, MapBase<Map>)
+    typedef typename ei_traits<Map>::AlignedDerivedType AlignedDerivedType;
 
     inline int stride() const { return this->innerSize(); }
 
-    inline const Scalar& coeff(int row, int col) const
+    AlignedDerivedType allowAligned()
     {
-      if(Flags & RowMajorBit)
-        return m_data[col + row * m_cols.value()];
-      else // column-major
-        return m_data[row + col * m_rows.value()];
+      if (PacketAccess==Aligned)
+        return *this;
+      else
+        return Map<MatrixType,Aligned>(Base::m_data, Base::m_rows.value(), Base::m_cols.value());
     }
 
-    inline Scalar& coeffRef(int row, int col)
-    {
-      if(Flags & RowMajorBit)
-        return const_cast<Scalar*>(m_data)[col + row * m_cols.value()];
-      else // column-major
-        return const_cast<Scalar*>(m_data)[row + col * m_rows.value()];
-    }
+    inline Map(const Scalar* data) : Base(data) {}
 
-    inline const Scalar& coeff(int index) const
-    {
-      return m_data[index];
-    }
+    inline Map(const Scalar* data, int size) : Base(data, size) {}
 
-    inline Scalar& coeffRef(int index)
-    {
-      return *const_cast<Scalar*>(m_data + index);
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(int row, int col) const
-    {
-      return ei_ploadt<Scalar, LoadMode == Aligned ? Alignment : Unaligned>
-               (m_data + (Flags & RowMajorBit
-                         ? col + row * m_cols.value()
-                         : row + col * m_rows.value()));
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(int index) const
-    {
-      return ei_ploadt<Scalar, LoadMode == Aligned ? Alignment : Unaligned>(m_data + index);
-    }
-
-    template<int StoreMode>
-    inline void writePacket(int row, int col, const PacketScalar& x)
-    {
-      ei_pstoret<Scalar, PacketScalar, StoreMode == Aligned ? Alignment : Unaligned>
-               (const_cast<Scalar*>(m_data) + (Flags & RowMajorBit
-                         ? col + row * m_cols.value()
-                         : row + col * m_rows.value()), x);
-    }
-
-    template<int StoreMode>
-    inline void writePacket(int index, const PacketScalar& x)
-    {
-      ei_pstoret<Scalar, PacketScalar, StoreMode == Aligned ? Alignment : Unaligned>
-        (const_cast<Scalar*>(m_data) + index, x);
-    }
-
-    inline Map(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
-    {
-      EIGEN_STATIC_ASSERT_FIXED_SIZE(MatrixType)
-    }
-
-    inline Map(const Scalar* data, int size)
-            : m_data(data),
-              m_rows(RowsAtCompileTime == Dynamic ? size : RowsAtCompileTime),
-              m_cols(ColsAtCompileTime == Dynamic ? size : ColsAtCompileTime)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(MatrixType)
-      ei_assert(size > 0);
-      ei_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == size);
-    }
-
-    inline Map(const Scalar* data, int rows, int cols)
-            : m_data(data), m_rows(rows), m_cols(cols)
-    {
-      ei_assert(rows > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
-               && cols > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
-    }
+    inline Map(const Scalar* data, int rows, int cols) : Base(data, rows, cols) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
-
-  protected:
-    const Scalar* m_data;
-    const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
-    const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
 };
 
 /** Constructor copying an existing array of data.
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
new file mode 100644
index 000000000..0b54ca7b8
--- /dev/null
+++ b/Eigen/src/Core/MapBase.h
@@ -0,0 +1,167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_MAPBASE_H
+#define EIGEN_MAPBASE_H
+
+/** \internal
+  *
+  * \class MapBase
+  *
+  * \brief Base class for Map and Block expression with direct access
+  *
+  * \sa class Map, class Block
+  */
+template<typename Derived> class MapBase
+  : public MatrixBase<Derived>
+{
+  public:
+
+    typedef MatrixBase<Derived> Base;
+    enum {
+      IsRowMajor = int(ei_traits<Derived>::Flags) & RowMajorBit ? 1 : 0,
+      PacketAccess = ei_traits<Derived>::PacketAccess,
+      RowsAtCompileTime = ei_traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = ei_traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = Base::SizeAtCompileTime
+    };
+    
+    typedef typename ei_traits<Derived>::AlignedDerivedType AlignedDerivedType;
+    typedef typename ei_traits<Derived>::Scalar Scalar;
+    typedef typename Base::PacketScalar PacketScalar;
+    using Base::derived;
+
+    inline int rows() const { return m_rows.value(); }
+    inline int cols() const { return m_cols.value(); }
+
+    inline int stride() const { return derived().stride(); }
+    AlignedDerivedType allowAligned() { return derived().allowAligned(); }
+
+    inline const Scalar& coeff(int row, int col) const
+    {
+      if(IsRowMajor)
+        return m_data[col + row * stride()];
+      else // column-major
+        return m_data[row + col * stride()];
+    }
+
+    inline Scalar& coeffRef(int row, int col)
+    {
+      if(IsRowMajor)
+        return const_cast<Scalar*>(m_data)[col + row * stride()];
+      else // column-major
+        return const_cast<Scalar*>(m_data)[row + col * stride()];
+    }
+    
+    inline const Scalar coeff(int index) const
+    {
+      ei_assert(Derived::IsVectorAtCompileTime || (ei_traits<Derived>::Flags & LinearAccessBit));
+      if ( ((RowsAtCompileTime == 1) == IsRowMajor) )
+        return m_data[index];
+      else
+        return m_data[index*stride()];
+    }
+
+    inline Scalar& coeffRef(int index)
+    {
+      return *const_cast<Scalar*>(m_data + index);
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(int row, int col) const
+    {
+      return ei_ploadt<Scalar, int(PacketAccess) == Aligned ? Aligned : LoadMode>
+               (m_data + (IsRowMajor ? col + row * stride()
+                                     : row + col * stride()));
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(int index) const
+    {
+      return ei_ploadt<Scalar, int(PacketAccess) == Aligned ? Aligned : LoadMode>(m_data + index);
+    }
+
+    template<int StoreMode>
+    inline void writePacket(int row, int col, const PacketScalar& x)
+    {
+      ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == Aligned ? Aligned : StoreMode>
+               (const_cast<Scalar*>(m_data) + (IsRowMajor ? col + row * stride()
+                                                          : row + col * stride()), x);
+    }
+
+    template<int StoreMode>
+    inline void writePacket(int index, const PacketScalar& x)
+    {
+      ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == Aligned ? Aligned : StoreMode>
+        (const_cast<Scalar*>(m_data) + index, x);
+    }
+
+    inline MapBase(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
+    {
+      EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+    }
+
+    inline MapBase(const Scalar* data, int size)
+            : m_data(data),
+              m_rows(RowsAtCompileTime == Dynamic ? size : RowsAtCompileTime),
+              m_cols(ColsAtCompileTime == Dynamic ? size : ColsAtCompileTime)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+      ei_assert(size > 0);
+      ei_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == size);
+    }
+
+    inline MapBase(const Scalar* data, int rows, int cols)
+            : m_data(data), m_rows(rows), m_cols(cols)
+    {
+      ei_assert(rows > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+             && cols > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MapBase)
+
+//     EIGEN_INHERIT_ASSIGNMENT_OPERATOR(MapBase, =)
+
+    template<typename OtherDerived>
+    Derived& operator+=(const MatrixBase<OtherDerived>& other)
+    { return derived() = allowAligned() + other; }
+    
+    template<typename OtherDerived>
+    Derived& operator-=(const MatrixBase<OtherDerived>& other)
+    { return derived() = allowAligned() - other; }
+
+    Derived& operator*=(const Scalar& other)
+    { return derived() = allowAligned() * other; }
+    
+    Derived& operator/=(const Scalar& other)
+    { return derived() = allowAligned() / other; }
+
+  protected:
+    const Scalar* __restrict__ m_data;
+    const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
+    const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
+};
+
+#endif // EIGEN_MAPBASE_H
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 0243faaed..25a4c8b08 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -257,9 +257,9 @@ template<typename Derived> class MatrixBase
     void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other);
     template<typename OtherDerived>
     void copyCoeff(int index, const MatrixBase<OtherDerived>& other);
-    template<typename OtherDerived, int LoadStoreMode>
+    template<typename OtherDerived, int StoreMode, int LoadMode>
     void copyPacket(int row, int col, const MatrixBase<OtherDerived>& other);
-    template<typename OtherDerived, int LoadStoreMode>
+    template<typename OtherDerived, int StoreMode, int LoadMode>
     void copyPacket(int index, const MatrixBase<OtherDerived>& other);
 
     template<int LoadMode>
diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h
index fa75429c8..6c7280800 100644
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@@ -33,17 +33,22 @@
 template<typename Derived>
 struct ei_sum_traits
 {
+private:
+  enum {
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size
+  };
+
 public:
   enum {
     Vectorization = (int(Derived::Flags)&ActualPacketAccessBit)
                  && (int(Derived::Flags)&LinearAccessBit)
+                 && (int(Derived::SizeAtCompileTime)>2*PacketSize)
                   ? LinearVectorization
                   : NoVectorization
   };
 
 private:
   enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
     Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
            + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
@@ -131,7 +136,8 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true>
         : Index % Derived::RowsAtCompileTime,
     col = int(Derived::Flags)&RowMajorBit
         ? Index % int(Derived::ColsAtCompileTime)
-        : Index / Derived::RowsAtCompileTime
+        : Index / Derived::RowsAtCompileTime,
+    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
   };
 
   typedef typename Derived::Scalar Scalar;
@@ -139,7 +145,7 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true>
 
   inline static PacketScalar run(const Derived &mat)
   {
-    return mat.template packet<Aligned>(row, col);
+    return mat.template packet<alignment>(row, col);
   }
 };
 
@@ -185,14 +191,21 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
   {
     const int size = mat.size();
     const int packetSize = ei_packet_traits<Scalar>::size;
-    const int alignedSize = (size/packetSize)*packetSize;
+    const int alignedStart =  (Derived::Flags & AlignedBit)
+                           || !(Derived::Flags & DirectAccessBit)
+                           ? 0
+                           : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
+    const int alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
+                        ? Aligned : Unaligned;
+    const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
+    const int alignedEnd = alignedStart + alignedSize;
     Scalar res;
 
-    if(size >= packetSize)
+    if(Derived::SizeAtCompileTime>=2*packetSize && alignedSize >= 2*packetSize)
     {
-      PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
-      for(int index = packetSize; index < alignedSize; index += packetSize)
-        packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));
+      PacketScalar packet_res = mat.template packet<alignment>(alignedStart, alignedStart);
+      for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
+        packet_res = ei_padd(packet_res, mat.template packet<alignment>(index));
 
       res = ei_predux(packet_res);
     }
@@ -202,10 +215,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
       res = Scalar(0);
     }
 
-    for(int index = alignedSize; index < size; index++)
-    {
+    for(int index = alignedEnd; index < size; index++)
+      res += mat.coeff(index);
+
+    for(int index = alignedEnd; index < size; index++)
       res += mat.coeff(index);
-    }
 
     return res;
   }
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index b58fd1279..31e8170f5 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -59,6 +59,16 @@ template<typename ExpressionType> class SwapWrapper
     inline int cols() const { return m_expression.cols(); }
     inline int stride() const { return m_expression.stride(); }
 
+    inline Scalar& coeffRef(int row, int col)
+    {
+      return m_expression.const_cast_derived().coeffRef(row, col);
+    }
+
+    inline Scalar& coeffRef(int index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
     template<typename OtherDerived>
     void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other)
     {
@@ -80,29 +90,29 @@ template<typename ExpressionType> class SwapWrapper
       _other.coeffRef(index) = tmp;
     }
 
-    template<typename OtherDerived, int LoadStoreMode>
+    template<typename OtherDerived, int StoreMode, int LoadMode>
     void copyPacket(int row, int col, const MatrixBase<OtherDerived>& other)
     {
       OtherDerived& _other = other.const_cast_derived();
       ei_internal_assert(row >= 0 && row < rows()
                         && col >= 0 && col < cols());
-      Packet tmp = m_expression.template packet<LoadStoreMode>(row, col);
-      m_expression.template writePacket<LoadStoreMode>(row, col,
-        _other.template packet<LoadStoreMode>(row, col)
+      Packet tmp = m_expression.template packet<StoreMode>(row, col);
+      m_expression.template writePacket<StoreMode>(row, col,
+        _other.template packet<LoadMode>(row, col)
       );
-      _other.template writePacket<LoadStoreMode>(row, col, tmp);
+      _other.template writePacket<LoadMode>(row, col, tmp);
     }
 
-    template<typename OtherDerived, int LoadStoreMode>
+    template<typename OtherDerived, int StoreMode, int LoadMode>
     void copyPacket(int index, const MatrixBase<OtherDerived>& other)
     {
       OtherDerived& _other = other.const_cast_derived();
       ei_internal_assert(index >= 0 && index < m_expression.size());
-      Packet tmp = m_expression.template packet<LoadStoreMode>(index);
-      m_expression.template writePacket<LoadStoreMode>(index,
-        _other.template packet<LoadStoreMode>(index)
+      Packet tmp = m_expression.template packet<StoreMode>(index);
+      m_expression.template writePacket<StoreMode>(index,
+        _other.template packet<LoadMode>(index)
       );
-      _other.template writePacket<LoadStoreMode>(index, tmp);
+      _other.template writePacket<LoadMode>(index, tmp);
     }
 
   protected:
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 24c653e2e..ea3994544 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -119,42 +119,47 @@ const unsigned int LinearAccessBit = 0x10;
   */
 const unsigned int DirectAccessBit = 0x20;
 
+/** \ingroup flags
+  *
+  * means the first coefficient packet is guaranteed to be aligned */
+const unsigned int AlignedBit = 0x40;
+
 /** \ingroup flags
   *
   * means all diagonal coefficients are equal to 0 */
-const unsigned int ZeroDiagBit = 0x40;
+const unsigned int ZeroDiagBit = 0x80;
 
 /** \ingroup flags
   *
   * means all diagonal coefficients are equal to 1 */
-const unsigned int UnitDiagBit = 0x80;
+const unsigned int UnitDiagBit = 0x100;
 
 /** \ingroup flags
   *
   * means the matrix is selfadjoint (M=M*). */
-const unsigned int SelfAdjointBit = 0x100;
+const unsigned int SelfAdjointBit = 0x200;
 
 /** \ingroup flags
   *
   * means the strictly lower triangular part is 0 */
-const unsigned int UpperTriangularBit = 0x200;
+const unsigned int UpperTriangularBit = 0x400;
 
 /** \ingroup flags
   *
   * means the strictly upper triangular part is 0 */
-const unsigned int LowerTriangularBit = 0x400;
+const unsigned int LowerTriangularBit = 0x800;
 
 /** \ingroup flags
   *
   * means the expression includes sparse matrices and the sparse path has to be taken. */
-const unsigned int SparseBit = 0x800;
+const unsigned int SparseBit = 0x1000;
 
 /** \ingroup flags
   *
   * currently unused. Means the matrix probably has a very big size.
   * Could eventually be used as a hint to determine which algorithms
   * to use. */
-const unsigned int LargeBit = 0x1000;
+const unsigned int LargeBit = 0x2000;
 
 // list of flags that are inherited by default
 const unsigned int HereditaryBits = RowMajorBit
@@ -175,15 +180,21 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
 const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
 const unsigned int Diagonal = Upper | Lower;
 
-enum { Aligned=0, Unaligned=1 };
+enum { Aligned=0, Unaligned=1, Unknown=2 };
 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
 enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, DiagonalProduct, SparseProduct };
 
 enum {
+  /** \internal Equivalent to a slice vectorization for fixed-size matrices having good alignement
+    * and good size */
   InnerVectorization,
+  /** \internal Vectorization path using a single loop plus scalar loops for the
+    * unaligned boundaries */
   LinearVectorization,
+  /** \internal Generic vectorization path using one vectorized loop per row/column with some
+    * scalar loops to handle the unaligned boundaries */
   SliceVectorization,
   NoVectorization
 };
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index a886a90d0..7a1f95443 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -43,8 +43,8 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
 template<typename ExpressionType> class NestByValue;
 template<typename ExpressionType> class SwapWrapper;
 template<typename MatrixType> class Minor;
-template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic,
-         int DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit> class Block;
+template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic, int PacketAccess=Unaligned,
+         int _DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit> class Block;
 template<typename MatrixType> class Transpose;
 template<typename MatrixType> class Conjugate;
 template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
@@ -53,7 +53,7 @@ template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
 template<typename Lhs, typename Rhs, int ProductMode> class Product;
 template<typename CoeffsVectorType> class DiagonalMatrix;
 template<typename MatrixType> class DiagonalCoeffs;
-template<typename MatrixType, int Alignment = Unaligned> class Map;
+template<typename MatrixType, int PacketAccess = Unaligned> class Map;
 template<typename MatrixType, unsigned int Mode> class Part;
 template<typename MatrixType, unsigned int Mode> class Extract;
 template<typename ExpressionType> class Cwise;
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 2fb401c6c..9d844d222 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -168,12 +168,14 @@ class ei_corrected_matrix_flags
            packet_access_bit
             = ei_packet_traits<Scalar>::size > 1
               && (is_big || linear_size%ei_packet_traits<Scalar>::size==0)
-              ? PacketAccessBit : 0
+              ? PacketAccessBit : 0,
+           aligned_bit = packet_access_bit
+                         && (is_big || linear_size%ei_packet_traits<Scalar>::size==0) ? AlignedBit : 0
     };
 
   public:
     enum { ret = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit | PacketAccessBit | RowMajorBit))
-                                    | LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit
+                                    | LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit | aligned_bit
     };
 };
 
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index e17e9ff4a..b2065fdcc 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -105,8 +105,11 @@ public:
 
   /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
     * its four coefficients \a w, \a x, \a y and \a z.
+    * 
+    * \warning Note the order of the arguments: the real \a w coefficient first,
+    * while internally the coefficients are stored in the following order:
+    * [\c x, \c y, \c z, \c w]
     */
-  // FIXME what is the prefered order: w x,y,z or x,y,z,w ?
   inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z)
   { m_coeffs << x, y, z, w; }
 
@@ -313,8 +316,8 @@ inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBas
 }
 
 /** \returns the multiplicative inverse of \c *this
-  * Note that in most cases, i.e., if you simply want the opposite
-  * rotation, it is enough to use the conjugate.
+  * Note that in most cases, i.e., if you simply want the opposite rotation,
+  * and/or the quaternion is normalized, then it is enough to use the conjugate.
   *
   * \sa Quaternion::conjugate()
   */