From bf5326c3cad4b92082bb347a09e9a9ab6d50075b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 8 May 2008 08:12:52 +0000
Subject: [PATCH] * Added ReferencableBit flag to known if coeffRef is
 available.   (needed by the new product implementation) * Make the packet*
 members template to support aligned and unaligned   access. This makes Block
 vectorizable. Combined with ReferencableBit,   we should be able to determine
 at runtime (in some specific cases) if   an aligned vectorization is possible
 or not. * Improved the new product implementation to robustly handle all
 cases,   it now passes all the tests. * Renamed the packet version ei_predux
 to ei_preduxp to avoid name collision.

---
 Eigen/Core                      |   5 +
 Eigen/src/Core/Block.h          |   6 +-
 Eigen/src/Core/Map.h            |   2 +-
 Eigen/src/Core/PacketMath.h     |   4 +-
 Eigen/src/Core/PacketMath_SSE.h |  40 +++++-
 Eigen/src/Core/Product.h        |   6 +
 Eigen/src/Core/ProductWIP.h     | 235 +++++++++++++++++---------------
 Eigen/src/Core/util/Constants.h |   6 +-
 Eigen/src/Core/util/Macros.h    |   7 +
 9 files changed, 191 insertions(+), 120 deletions(-)
diff --git a/Eigen/Core b/Eigen/Core
index 999bb96f2..cc05c308e 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -1,13 +1,18 @@
 #ifndef EIGEN_CORE_H
 #define EIGEN_CORE_H
 
+#define EIGEN_WIP_PRODUCT
+
 #ifndef EIGEN_DONT_VECTORIZE
 #if ((defined __SSE2__) && ( (!defined __GNUC__) || (__GNUC__>=4 && __GNUC_MINOR__>=2)))
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_SSE
 #include <emmintrin.h>
 #include <xmmintrin.h>
+// SSE3:
 #include <pmmintrin.h>
+// SSSE3:
+//#include <tmmintrin.h>
 #endif
 #ifdef __ALTIVEC__  // There are zero chances of both __SSE2__ AND __ALTIVEC__ been defined
 #define EIGEN_VECTORIZE
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 247a46b40..00bb375a9 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -71,7 +71,7 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
                       || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
                       ? ~LargeBit
                       : ~(unsigned int)0,
-    Flags = MatrixType::Flags & DefaultLostFlagMask & FlagsMaskLargeBit,
+    Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | ReferencableBit) & FlagsMaskLargeBit,
     CoeffReadCost = MatrixType::CoeffReadCost
   };
 };
@@ -146,13 +146,13 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
     template<int LoadMode>
     PacketScalar _packetCoeff(int row, int col) const
     {
-      return m_matrix.packetCoeff<UnAligned>(row + m_startRow.value(), col + m_startCol.value());
+      return m_matrix.template packetCoeff<UnAligned>(row + m_startRow.value(), col + m_startCol.value());
     }
 
     template<int LoadMode>
     void _writePacketCoeff(int row, int col, const PacketScalar& x)
     {
-      m_matrix.const_cast_derived().writePacketCoeff<UnAligned>(row + m_startRow.value(), col + m_startCol.value(), x);
+      m_matrix.const_cast_derived().template writePacketCoeff<UnAligned>(row + m_startRow.value(), col + m_startCol.value(), x);
     }
 
   protected:
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 266da0cfe..e9f65cd7a 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -47,7 +47,7 @@ struct ei_traits<Map<MatrixType> >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags & DefaultLostFlagMask,
+    Flags = MatrixType::Flags & (DefaultLostFlagMask | ReferencableBit),
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 };
diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h
index 59c143ede..267192482 100644
--- a/Eigen/src/Core/PacketMath.h
+++ b/Eigen/src/Core/PacketMath.h
@@ -71,10 +71,10 @@ template <typename Scalar> inline void ei_pstoreu(Scalar* to, const Scalar& from
 template <typename Scalar> inline Scalar ei_pfirst(const Scalar& a) { return a; }
 
 /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-// template <typename Scalar> inline Scalar ei_predux(const Scalar* vecs) { return vecs[0]; }
+template <typename Scalar> inline Scalar ei_preduxp(const Scalar* vecs) { return vecs[0]; }
 
 /** \internal \returns the sum of the elements of \a a*/
-// template <typename Scalar> inline Scalar ei_predux(const Scalar& a) { return a; }
+template <typename Scalar> inline Scalar ei_predux(const Scalar& a) { return a; }
 
 #endif // EIGEN_PACKET_MATH_H
 
diff --git a/Eigen/src/Core/PacketMath_SSE.h b/Eigen/src/Core/PacketMath_SSE.h
index 1872affea..dd467f52a 100644
--- a/Eigen/src/Core/PacketMath_SSE.h
+++ b/Eigen/src/Core/PacketMath_SSE.h
@@ -102,14 +102,19 @@ inline int    ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); }
 
 #ifdef __SSE3__
 // TODO implement SSE2 versions as well as integer versions
-inline __m128 ei_predux(const __m128* vecs)
+inline __m128 ei_preduxp(const __m128* vecs)
 {
   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
 }
-inline __m128d ei_predux(const __m128d* vecs)
+inline __m128d ei_preduxp(const __m128d* vecs)
 {
   return _mm_hadd_pd(vecs[0], vecs[1]);
 }
+// SSSE3 version:
+// inline __m128i ei_preduxp(const __m128i* vecs)
+// {
+//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+// }
 
 inline float ei_predux(const __m128& a)
 {
@@ -118,6 +123,13 @@ inline float ei_predux(const __m128& a)
 }
 
 inline double ei_predux(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); }
+
+// SSSE3 version:
+// inline float ei_predux(const __m128i& a)
+// {
+//   __m128i tmp0 = _mm_hadd_epi32(a,a);
+//   return ei_pfirst(_mm_hadd_epi32(tmp0, tmp0));
+// }
 #else
 // SSE2 versions
 inline float ei_predux(const __m128& a)
@@ -130,7 +142,7 @@ inline double ei_predux(const __m128d& a)
   return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
 }
 
-inline __m128 ei_predux(const __m128* vecs)
+inline __m128 ei_preduxp(const __m128* vecs)
 {
   __m128 tmp0, tmp1, tmp2;
   tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
@@ -144,11 +156,31 @@ inline __m128 ei_predux(const __m128* vecs)
   return _mm_add_ps(tmp0, tmp2);
 }
 
-inline __m128d ei_predux(const __m128d* vecs)
+inline __m128d ei_preduxp(const __m128d* vecs)
 {
   return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
 }
 #endif  // SSE3
 
+inline int ei_predux(const __m128i& a)
+{
+  __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  return ei_pfirst(tmp) + ei_pfirst(_mm_shuffle_epi32(tmp, 1));
+}
+
+inline __m128i ei_preduxp(const __m128i* vecs)
+{
+  __m128i tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+  tmp0 = _mm_add_epi32(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  tmp1 = _mm_add_epi32(tmp1, tmp2);
+  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
+  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
+  return _mm_add_epi32(tmp0, tmp2);
+}
+
 #endif // EIGEN_PACKET_MATH_SSE_H
 
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index c1b2f5457..66c6d4d5b 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -107,6 +107,12 @@ struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScal
   static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
 };
 
+template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_packet_product_unroller<false, Index, Dynamic, Lhs, Rhs, PacketScalar>
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template<typename Product, bool RowMajor = true> struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
diff --git a/Eigen/src/Core/ProductWIP.h b/Eigen/src/Core/ProductWIP.h
index 57bb899d6..a5fc9d298 100644
--- a/Eigen/src/Core/ProductWIP.h
+++ b/Eigen/src/Core/ProductWIP.h
@@ -73,7 +73,7 @@ struct ei_packet_product_unroller<true, Index, Size, Lhs, Rhs, PacketScalar>
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller<true, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
-    res =  ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
+    res =  ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packetCoeff<Aligned>(Index, col), res);
   }
 };
 
@@ -83,7 +83,7 @@ struct ei_packet_product_unroller<false, Index, Size, Lhs, Rhs, PacketScalar>
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller<false, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
-    res =  ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
+    res =  ei_pmadd(lhs.template packetCoeff<Aligned>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };
 
@@ -92,7 +92,7 @@ struct ei_packet_product_unroller<true, 0, Size, Lhs, Rhs, PacketScalar>
 {
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.packetCoeff(0, col));
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packetCoeff<Aligned>(0, col));
   }
 };
 
@@ -101,7 +101,7 @@ struct ei_packet_product_unroller<false, 0, Size, Lhs, Rhs, PacketScalar>
 {
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col)));
+    res = ei_pmul(lhs.template packetCoeff<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
   }
 };
 
@@ -111,6 +111,12 @@ struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScal
   static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
 };
 
+template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_packet_product_unroller<false, Index, Dynamic, Lhs, Rhs, PacketScalar>
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template<typename Product, bool RowMajor = true> struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
@@ -139,16 +145,53 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode
 {
   enum{ value =  Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-              && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit)))
               ? CacheOptimalProduct : NormalProduct };
 };
 
+template<typename T> class ei_product_eval_to_column_major
+{
+    typedef typename ei_traits<T>::Scalar _Scalar;
+    enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
+          _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
+          _Flags = ei_traits<T>::Flags
+    };
+
+  public:
+    typedef Matrix<_Scalar,
+                  ei_traits<T>::RowsAtCompileTime,
+                  ei_traits<T>::ColsAtCompileTime,
+                  ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret & ~RowMajorBit,
+                  ei_traits<T>::MaxRowsAtCompileTime,
+                  ei_traits<T>::MaxColsAtCompileTime> type;
+};
+
+template<typename T, int n=1> struct ei_product_nested_rhs
+{
+  typedef typename ei_meta_if<
+    ei_is_temporary<T>::ret && !(ei_traits<T>::Flags & RowMajorBit),
+    T,
+    typename ei_meta_if<
+         (ei_traits<T>::Flags & EvalBeforeNestingBit)
+      || (ei_traits<T>::Flags & RowMajorBit)
+      || (!(ei_traits<T>::Flags & ReferencableBit))
+      || (n+1) * NumTraits<typename ei_traits<T>::Scalar>::ReadCost < (n-1) * T::CoeffReadCost,
+      typename ei_product_eval_to_column_major<T>::type,
+      const T&
+    >::ret
+  >::ret type;
+};
+
 template<typename Lhs, typename Rhs, int EvalMode>
 struct ei_traits<Product<Lhs, Rhs, EvalMode> >
 {
   typedef typename Lhs::Scalar Scalar;
-  typedef typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-  typedef typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+  // the cache friendly product evals lhs once only
+  // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ?
+  typedef typename ei_nested<Lhs, EvalMode==CacheOptimalProduct ? 0 : Rhs::ColsAtCompileTime>::type LhsNested;
+  // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation
+  typedef typename ei_meta_if<EvalMode==CacheOptimalProduct,
+      typename ei_product_nested_rhs<Rhs,Lhs::RowsAtCompileTime>::type,
+      typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type>::ret RhsNested;
   typedef typename ei_unref<LhsNested>::type _LhsNested;
   typedef typename ei_unref<RhsNested>::type _RhsNested;
   enum {
@@ -160,16 +203,20 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
     ColsAtCompileTime = Rhs::ColsAtCompileTime,
     MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
+    // the vectorization flags are only used by the normal product,
+    // the other one is always vectorized !
     _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
     _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
-    _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
+    _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 0 : 0,
     _RowMajor = (RhsFlags & RowMajorBit)
               && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
     _LostBits = DefaultLostFlagMask & ~(
                 (_RowMajor ? 0 : RowMajorBit)
               | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
     Flags = ((unsigned int)(LhsFlags | RhsFlags) & _LostBits)
-//          | EvalBeforeAssigningBit //FIXME
+    #ifndef EIGEN_WIP_PRODUCT_DIRTY
+          | EvalBeforeAssigningBit //FIXME
+    #endif
           | EvalBeforeNestingBit
           | (_Vectorizable ? VectorizableBit : 0),
     CoeffReadCost
@@ -197,11 +244,16 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       PacketSize = ei_packet_traits<Scalar>::size,
       #if (defined __i386__)
       // i386 architectures provides only 8 xmmm register,
-      // so let's reduce the max number of rows processed at once
+      // so let's reduce the max number of rows processed at once.
+      // NOTE that so far the maximal supported value is 8.
       MaxBlockRows = 4,
+      MaxBlockRows_ClampingMask = 0xFFFFFC,
       #else
       MaxBlockRows = 8,
+      MaxBlockRows_ClampingMask = 0xFFFFF8,
       #endif
+      // maximal size of the blocks fitted in L2 cache
+      MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
     };
 
     Product(const Lhs& lhs, const Rhs& rhs)
@@ -214,16 +266,6 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
     template<typename DestDerived>
     void _cacheFriendlyEval(DestDerived& res) const;
 
-    /** \internal */
-    template<typename DestDerived, int RhsAlignment, int ResAlignment>
-    void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
-
-    /** \internal */
-    template<typename DestDerived, int RhsAlignment, int ResAlignment, int BlockRows>
-    void _cacheFriendlyEvalKernel(DestDerived& res,
-      int l2i, int l2j, int l2k, int l1i,
-      int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
-
   private:
 
     int _rows() const { return m_lhs.rows(); }
@@ -255,7 +297,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT)
       {
         PacketScalar res;
-        ei_packet_product_unroller<Flags&RowMajorBit, Lhs::ColsAtCompileTime-1,
+        ei_packet_product_unroller<Flags&RowMajorBit ? true : false, Lhs::ColsAtCompileTime-1,
                             Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT
                               ? Lhs::ColsAtCompileTime : Dynamic,
                             _LhsNested, _RhsNested, PacketScalar>
@@ -269,21 +311,30 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
     PacketScalar _packetCoeffRowMajor(int row, int col) const
     {
       PacketScalar res;
-      res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col));
+      res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.template packetCoeff<Aligned>(0, col));
       for(int i = 1; i < m_lhs.cols(); i++)
-        res =  ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
+        res =  ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.template packetCoeff<Aligned>(i, col), res);
       return res;
     }
 
     PacketScalar _packetCoeffColumnMajor(int row, int col) const
     {
       PacketScalar res;
-      res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
+      res = ei_pmul(m_lhs.template packetCoeff<Aligned>(row, 0), ei_pset1(m_rhs.coeff(0, col)));
       for(int i = 1; i < m_lhs.cols(); i++)
-        res =  ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
+        res =  ei_pmadd(m_lhs.template packetCoeff<Aligned>(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
       return res;
     }
 
+    /** \internal */
+    template<typename DestDerived, int RhsAlignment, int ResAlignment>
+    void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
+
+    /** \internal */
+    template<typename DestDerived, int RhsAlignment, int ResAlignment, int BlockRows>
+    void _cacheFriendlyEvalKernel(DestDerived& res,
+      int l2i, int l2j, int l2k, int l1i,
+      int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
 
   protected:
     const LhsNested m_lhs;
@@ -361,7 +412,8 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
     int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*BlockRows;
     const Scalar* localB = &block[offsetblock];
 
-    int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
+//     int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
+    Scalar* rhsColumn = &(m_rhs.const_cast_derived().coeffRef(0, l1j));
 
     // don't worry, dst is a set of registers
     PacketScalar dst[BlockRows];
@@ -383,7 +435,8 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
 
     // unaligned loads are expensive, therefore let's preload the next element in advance
     if (RhsAlignment==UnAligned)
-      tmp1 = ei_ploadu(&m_rhs.derived().data()[l1jsize+l2k]);
+      //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize+l2k]);
+      tmp1 = ei_ploadu(&rhsColumn[l2k]);
 
     for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
     {
@@ -392,13 +445,15 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
       //PacketScalar tmp = m_rhs.template packetCoeff<Aligned>(k, l1j);
       if (RhsAlignment==Aligned)
       {
-        tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
+        //tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
+        tmp = ei_pload(&rhsColumn[k]);
       }
       else
       {
         tmp = tmp1;
         if (k+PacketSize<l2blockSizeEnd)
-          tmp1 = ei_ploadu(&m_rhs.data()[l1jsize + k+PacketSize]);
+          //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize + k+PacketSize]);
+          tmp1 = ei_ploadu(&rhsColumn[k+PacketSize]);
       }
 
                         dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows             ])), dst[0]);
@@ -421,16 +476,16 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
     // we have up to 4 packets (for doubles: 8 rows / 2)
     if (PacketRows>=1)
       res.template writePacketCoeff<Aligned>(l1i, l1j,
-        ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_predux(&(dst[0]))));
+        ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_preduxp(&(dst[0]))));
     if (PacketRows>=2)
       res.template writePacketCoeff<Aligned>(l1i+PacketSize, l1j,
-        ei_padd(res.template packetCoeff<Aligned>(l1i+PacketSize, l1j), ei_predux(&(dst[PacketSize]))));
+        ei_padd(res.template packetCoeff<Aligned>(l1i+PacketSize, l1j), ei_preduxp(&(dst[PacketSize]))));
     if (PacketRows>=3)
       res.template writePacketCoeff<Aligned>(l1i+2*PacketSize, l1j,
-        ei_padd(res.template packetCoeff<Aligned>(l1i+2*PacketSize, l1j), ei_predux(&(dst[2*PacketSize]))));
+        ei_padd(res.template packetCoeff<Aligned>(l1i+2*PacketSize, l1j), ei_preduxp(&(dst[2*PacketSize]))));
     if (PacketRows>=4)
       res.template writePacketCoeff<Aligned>(l1i+3*PacketSize, l1j,
-        ei_padd(res.template packetCoeff<Aligned>(l1i+3*PacketSize, l1j), ei_predux(&(dst[3*PacketSize]))));
+        ei_padd(res.template packetCoeff<Aligned>(l1i+3*PacketSize, l1j), ei_preduxp(&(dst[3*PacketSize]))));
 
     // process the remaining rows one at a time
     if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]);
@@ -450,38 +505,37 @@ template<typename Lhs, typename Rhs, int EvalMode>
 template<typename DestDerived, int RhsAlignment, int ResAlignment>
 void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
 {
-  // allow direct access to data for benchmark purpose
-  const Scalar* __restrict__ a = m_lhs.derived().data();
-  const Scalar* __restrict__ b = m_rhs.derived().data();
-  Scalar* __restrict__ c = res.derived().data();
-
   // FIXME find a way to optimize: (an_xpr) + (a * b)
   // then we don't need to clear res and avoid and additional mat-mat sum
-//   res.setZero();
+  #ifndef EIGEN_WIP_PRODUCT_DIRTY
+//   std::cout << "wip product\n";
+  res.setZero();
+  #endif
 
-  const int bs = PacketSize * MaxBlockRows;                         // total number of elements treated at once
   const int rows = _rows();
   const int cols = _cols();
   const int remainingSize = m_lhs.cols()%PacketSize;
   const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries
-  const int l2blocksize = 256 > _cols() ? _cols() : 256;
-  // FIXME use calloca ?? (allocation on the stack)
-  Scalar* __restrict__ block = new Scalar[l2blocksize*size];
+  const int l2BlockRows = MaxL2BlockSize > _rows() ? _rows() : MaxL2BlockSize;
+  const int l2BlockCols = MaxL2BlockSize > _cols() ? _cols() : MaxL2BlockSize;
+  const int l2BlockSize = MaxL2BlockSize > size    ? size    : MaxL2BlockSize;
+  //Scalar* __restrict__ block = new Scalar[l2blocksize*size];;
+  Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size);
 
   // loops on each L2 cache friendly blocks of the result
-  for(int l2i=0; l2i<_rows(); l2i+=l2blocksize)
+  for(int l2i=0; l2i<_rows(); l2i+=l2BlockRows)
   {
-    const int l2blockRowEnd = std::min(l2i+l2blocksize, rows);
-    const int l2blockRowEndBW = l2blockRowEnd & 0xFFFFF8;             // end of the rows aligned to bw
-    const int l2blockRowRemaining = l2blockRowEnd - l2blockRowEndBW;  // number of remaining rows
+    const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
+    const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask;    // end of the rows aligned to bw
+    const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW;         // number of remaining rows
 
     // build a cache friendly block
     int count = 0;
 
     // copy l2blocksize rows of m_lhs to blocks of ps x bw
-    for(int l2k=0; l2k<size; l2k+=l2blocksize)
+    for(int l2k=0; l2k<size; l2k+=l2BlockSize)
     {
-      const int l2blockSizeEnd = std::min(l2k+l2blocksize, size);
+      const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
 
       for (int i = l2i; i<l2blockRowEndBW; i+=MaxBlockRows)
       {
@@ -494,25 +548,25 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
               block[count++] = m_lhs.coeff(i+w,k+s);
         }
       }
-      if (l2blockRowRemaining>0)
+      if (l2blockRemainingRows>0)
       {
         for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
         {
-          for (int w=0; w<l2blockRowRemaining; ++w)
+          for (int w=0; w<l2blockRemainingRows; ++w)
             for (int s=0; s<PacketSize; ++s)
               block[count++] = m_lhs.coeff(l2blockRowEndBW+w,k+s);
         }
       }
     }
 
-    for(int l2j=0; l2j<cols; l2j+=l2blocksize)
+    for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
     {
-      int l2blockColEnd = std::min(l2j+l2blocksize, cols);
+      int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
 
-      for(int l2k=0; l2k<size; l2k+=l2blocksize)
+      for(int l2k=0; l2k<size; l2k+=l2BlockSize)
       {
         // acumulate a bw rows of lhs time a single column of rhs to a bw x 1 block of res
-        int l2blockSizeEnd = std::min(l2k+l2blocksize, size);
+        int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
 
         // for each bw x 1 result's block
         for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
@@ -522,7 +576,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
 #if 0
           for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
           {
-            int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*bw/*bs*/;
+            int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
             const Scalar* localB = &block[offsetblock];
 
             int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
@@ -532,7 +586,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
             dst[1] = dst[0];
             dst[2] = dst[0];
             dst[3] = dst[0];
-            if (bw==8)
+            if (MaxBlockRows==8)
             {
               dst[4] = dst[0];
               dst[5] = dst[0];
@@ -543,7 +597,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
             // TODO in unaligned mode, preload the next element
 //             PacketScalar tmp1 = _mm_load_ps(&m_rhs.derived().data()[l1jsize+l2k]);
             asm("#eigen begincore");
-            for(int k=l2k; k<l2blockSizeEnd; k+=ps)
+            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
             {
 //               PacketScalar tmp = m_rhs.template packetCoeff<Aligned>(k, l1j);
               // TODO make this branching compile time (costly for doubles)
@@ -559,10 +613,10 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
               dst[1] = ei_pmadd(tmp, b1, dst[1]);
               b1 = ei_pload(&(localB[k*bw+3*ps]));
               dst[2] = ei_pmadd(tmp, b0, dst[2]);
-              if (bw==8)
+              if (MaxBlockRows==8)
                 b0 = ei_pload(&(localB[k*bw+4*ps]));
               dst[3] = ei_pmadd(tmp, b1, dst[3]);
-              if (bw==8)
+              if (MaxBlockRows==8)
               {
                 b1 = ei_pload(&(localB[k*bw+5*ps]));
                 dst[4] = ei_pmadd(tmp, b0, dst[4]);
@@ -576,19 +630,20 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
 
 //             if (resIsAligned)
             {
-              res.template writePacketCoeff<Aligned>(l1i, l1j, ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_predux(dst)));
-              if (ps==2)
-                res.template writePacketCoeff<Aligned>(l1i+2,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+2,l1j), ei_predux(&(dst[2]))));
-              if (bw==8)
+              res.template writePacketCoeff<Aligned>(l1i, l1j, ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_preduxp(dst)));
+              if (PacketSize==2)
+                res.template writePacketCoeff<Aligned>(l1i+2,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+2,l1j), ei_preduxp(&(dst[2]))));
+              if (MaxBlockRows==8)
               {
-                res.template writePacketCoeff<Aligned>(l1i+4,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+4,l1j), ei_predux(&(dst[4]))));
-                if (ps==2)
-                  res.template writePacketCoeff<Aligned>(l1i+6,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+6,l1j), ei_predux(&(dst[6]))));
+                res.template writePacketCoeff<Aligned>(l1i+4,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+4,l1j), ei_preduxp(&(dst[4]))));
+                if (PacketSize==2)
+                  res.template writePacketCoeff<Aligned>(l1i+6,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+6,l1j), ei_preduxp(&(dst[6]))));
               }
             }
 //             else
 //             {
 //               // TODO uncommenting this code kill the perf, even though it is never called !!
+//               // this is because dst cannot be a set of registers only
 //               // TODO optimize this loop
 //               // TODO is it better to do one redux at once or packet reduxes + unaligned store ?
 //               for (int w = 0; w<bw; ++w)
@@ -600,15 +655,15 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
           }
 #endif
         }
-        if (l2blockRowRemaining>0)
+        if (l2blockRemainingRows>0)
         {
           // this is an attempt to build an array of kernels, but I did not manage to get it compiles
 //           typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*);
 //           Kernel kernels[8];
 //           kernels[0] = (Kernel)(&Product<Lhs,Rhs,EvalMode>::template _cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 1>);
-//           kernels[l2blockRowRemaining](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
+//           kernels[l2blockRemainingRows](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
 
-          switch(l2blockRowRemaining)
+          switch(l2blockRemainingRows)
           {
           case 1:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 1>(
               res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
@@ -627,43 +682,6 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
           default:
             ei_internal_assert(false && "internal error"); break;
           }
-
-#if 0
-          // TODO optimize this part using a generic templated function that processes N rows
-          // here we process the remaining l2blockRowRemaining rows
-          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-          {
-            int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRowRemaining;
-            const Scalar* localB = &block[offsetblock];
-
-            int l1jsize = l1j * size;
-
-            PacketScalar dst[MaxBlockRows];
-            dst[0] = ei_pset1(Scalar(0.));
-            for (int w = 1; w<l2blockRowRemaining; ++w)
-              dst[w] = dst[0];
-            PacketScalar b0, b1, tmp;
-            asm("#eigen begincore dynamic");
-            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-            {
-              //PacketScalar tmp = m_rhs.packetCoeff(k, l1j);
-              if (rhsIsAligned)
-                tmp = ei_pload(&m_rhs.derived().data()[l1jsize + k]);
-              else
-                tmp = ei_ploadu(&m_rhs.derived().data()[l1jsize + k]);
-
-              // TODO optimize this loop
-              for (int w = 0; w<l2blockRowRemaining; ++w)
-                dst[w] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRowRemaining+w*PacketSize])), dst[w]);
-            }
-
-            // TODO optimize this loop
-            for (int w = 0; w<l2blockRowRemaining; ++w)
-              res.coeffRef(l2blockRowEndBW+w, l1j) += ei_predux(dst[w]);
-
-            asm("#eigen endcore dynamic");
-          }
-#endif
         }
       }
     }
@@ -678,9 +696,10 @@ void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
       NormalProduct>(
         m_lhs.block(0,size, _rows(), remainingSize),
         m_rhs.block(size,0, remainingSize, _cols())).lazy();
+//     res += m_lhs.block(0,size, _rows(), remainingSize)._lazyProduct(m_rhs.block(size,0, remainingSize, _cols()));
   }
 
-  delete[] block;
+//   delete[] block;
 }
 
 #endif // EIGEN_PRODUCT_H
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index c74d8f87b..48498bfae 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -43,16 +43,18 @@ const unsigned int NullDiagBit = 0x40;      ///< means all diagonal coefficients
 const unsigned int UnitDiagBit = 0x80;      ///< means all diagonal coefficients are equal to 1
 const unsigned int NullLowerBit = 0x200;    ///< means the strictly triangular lower part is 0
 const unsigned int NullUpperBit = 0x400;    ///< means the strictly triangular upper part is 0
+const unsigned int ReferencableBit = 0x800; ///< means the expression is writable through MatrixBase::coeffRef(int,int)
 
 enum { Upper=NullLowerBit, Lower=NullUpperBit };
 enum { Aligned=0, UnAligned=1 };
 
 // list of flags that are lost by default
-const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit);
+const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | ReferencableBit
+  | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit);
 
 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
-enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct };
+enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct, LazyProduct};
 
 #endif // EIGEN_CONSTANTS_H
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 613ab9617..f703d159a 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -45,6 +45,13 @@
 
 #define EIGEN_DEFAULT_MATRIX_FLAGS EIGEN_DEFAULT_MATRIX_STORAGE_ORDER
 
+/** Define a hint size when dealling with large matrices and L2 cache friendlyness
+  * More precisely, its square value represents the amount of bytes which can be assumed to stay in L2 cache.
+  */
+#ifndef EIGEN_TUNE_FOR_L2_CACHE_SIZE
+#define EIGEN_TUNE_FOR_L2_CACHE_SIZE 1024
+#endif
+
 #define USING_PART_OF_NAMESPACE_EIGEN \
 EIGEN_USING_MATRIX_TYPEDEFS \
 using Eigen::Matrix; \