diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index bad018e8e..8c012b58f 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -359,19 +359,6 @@ static void ei_cache_friendly_product(
 
 #endif // EIGEN_EXTERN_INSTANTIATIONS
 
-template<typename Scalar>
-inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
-{
-  typedef typename ei_packet_traits<Scalar>::type Packet;
-  const int PacketSize = ei_packet_traits<Scalar>::size;
-  const int PacketAlignedMask = PacketSize-1;
-  const bool Vectorized = PacketSize>1;
-  return Vectorized
-          ? std::min<int>( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask))
-                           & PacketAlignedMask, maxOffset)
-          : 0;
-}
-
 /* Optimized col-major matrix * vector product:
  * This algorithm processes 4 columns at onces that allows to both reduce
  * the number of load/stores of the result by a factor 4 and to reduce
@@ -420,7 +407,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
 
   // we cannot assume the first element is aligned because of sub-matrices
   const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
-  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0);
+  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize || PacketSize==1);
 
   // find how many columns do we have to skip to be aligned with the result (if possible)
   int skipColumns=0;
@@ -438,7 +425,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
     // note that the skiped columns are processed later.
   }
 
-  ei_internal_assert((alignmentPattern==NoneAligned)
+  ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
     || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
 
   int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
@@ -585,7 +572,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
 
   // we cannot assume the first element is aligned because of sub-matrices
   const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
-  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0);
+  ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0  || PacketSize==1 || size<PacketSize);
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   int skipRows=0;
   for (; skipRows<PacketSize && alignedStart != lhsAlignmentOffset + alignmentStep*skipRows; ++skipRows)
@@ -601,7 +588,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     skipRows = std::min(skipRows,res.size());
     // note that the skiped columns are processed later.
   }
-  ei_internal_assert((alignmentPattern==NoneAligned)
+  ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
     || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
 
   int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
diff --git a/Eigen/src/Core/DummyPacketMath.h b/Eigen/src/Core/DummyPacketMath.h
index 1a4ecbfa0..5aff21fe4 100644
--- a/Eigen/src/Core/DummyPacketMath.h
+++ b/Eigen/src/Core/DummyPacketMath.h
@@ -120,5 +120,19 @@ template <typename Scalar, typename Packet, int LoadMode> inline void ei_pstoret
     ei_pstoreu(to, from);
 }
 
+/** \internal \returns how many leading elements of \a ptr to skip so the element index becomes a multiple of the packet size, capped at \a maxOffset (always 0 when the packet size is 1) */
+template<typename Scalar>
+inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
+{
+  typedef typename ei_packet_traits<Scalar>::type Packet; // not referenced in this function body
+  const int PacketSize = ei_packet_traits<Scalar>::size;
+  const int PacketAlignedMask = PacketSize-1; // used as a modulo mask; assumes PacketSize is a power of two - TODO confirm
+  const bool Vectorized = PacketSize>1; // with a packet size of 1 no offset is computed at all
+  return Vectorized
+          ? std::min<int>( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask)) // address expressed in units of Scalar, then reduced with the mask
+                           & PacketAlignedMask, maxOffset) // the outer mask turns a result of PacketSize into 0
+          : 0;
+}
+
 #endif // EIGEN_DUMMY_PACKET_MATH_H
 
diff --git a/Eigen/src/Core/InverseProduct.h b/Eigen/src/Core/InverseProduct.h
index cfab54228..cd27c4ddf 100755
--- a/Eigen/src/Core/InverseProduct.h
+++ b/Eigen/src/Core/InverseProduct.h
@@ -98,6 +98,8 @@ struct ei_trisolve_selector<Lhs,Rhs,Upper,RowMajor>
 };
 
 // forward substitution, col-major
+// FIXME the Lower and Upper specializations could be merged using a small helper class
+// performing reflections on the coordinates...
 template<typename Lhs, typename Rhs>
 struct ei_trisolve_selector<Lhs,Rhs,Lower,ColMajor>
 {
@@ -138,6 +140,8 @@ struct ei_trisolve_selector<Lhs,Rhs,Lower,ColMajor>
          *   other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
          *                                       * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
          */
+        // FIXME this works well, but what about conjugate/adjoint expressions? Do we want to evaluate them first?
+        // This is a more general problem, though.
         ei_cache_friendly_product_colmajor_times_vector(
           size-endBlock, &(lhs.const_cast_derived().coeffRef(endBlock,startBlock)), lhs.stride(),
           btmp, &(other.coeffRef(endBlock,c)));
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 9a0365939..60a1d0a35 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -379,6 +379,7 @@ struct ei_product_coeff_vectorized_dyn_selector
 };
 
 // NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
+// NOTE they may no longer be needed now that we have a specialization for Block<Matrix>
 template<typename Lhs, typename Rhs, int RhsCols>
 struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
 {
@@ -406,7 +407,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
 template<typename Lhs, typename Rhs>
 struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
 {
-  inline static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  inline static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
     res = ei_dot_impl<
       Lhs,