From 22a816ade884e3eee6a2f6a25f342e6a0d3448a0 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 19 Jul 2008 00:09:01 +0000
Subject: [PATCH] * Fix a couple of issues related to the recent cache friendly
 products * Improve the efficiency of matrix*vector in unaligned cases *
 Trivial fixes in the destructors of MatrixStorage * Removed the matrixNorm in
 test/product.cpp (twice faster and   that assumed the matrix product was ok
 while checking that !!)

---
 Eigen/src/Core/CacheFriendlyProduct.h | 112 +++++++++++++++-----------
 Eigen/src/Core/DummyPacketMath.h      |   2 +-
 Eigen/src/Core/Map.h                  |  36 +--------
 Eigen/src/Core/Matrix.h               |   3 +-
 Eigen/src/Core/MatrixStorage.h        |   6 +-
 Eigen/src/Core/Product.h              |   4 +-
 Eigen/src/Core/arch/SSE/PacketMath.h  |   9 +++
 test/map.cpp                          |   1 -
 test/product.cpp                      |   4 +-
 9 files changed, 85 insertions(+), 92 deletions(-)
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index 669649de8..83f5da558 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -381,14 +381,14 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
     ei_pstore(&res[j OFFSET], \
       ei_padd(ei_pload(&res[j OFFSET]), \
         ei_padd( \
-          ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs[j OFFSET +iN0])),ei_pmul(ptmp1,ei_pload ## A13(&lhs[j OFFSET +iN1]))), \
-          ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs[j OFFSET +iN2])),ei_pmul(ptmp3,ei_pload ## A13(&lhs[j OFFSET +iN3]))) )))
+          ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs0[j OFFSET])),ei_pmul(ptmp1,ei_pload ## A13(&lhs1[j OFFSET]))), \
+          ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs2[j OFFSET])),ei_pmul(ptmp3,ei_pload ## A13(&lhs3[j OFFSET]))) )))
 
   asm("#begin matrix_vector_product");
   typedef typename ei_packet_traits<Scalar>::type Packet;
   const int PacketSize = sizeof(Packet)/sizeof(Scalar);
 
-  enum { AllAligned, EvenAligned, FirstAligned, NoneAligned };
+  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
   const int columnsAtOnce = 4;
   const int peels = 2;
   const int PacketAlignedMask = PacketSize-1;
@@ -401,9 +401,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
                          ? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
                          : 0;
   const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
-  const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
+  const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
 
-  const int alignmentStep = lhsStride % PacketSize;
+  const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask;
   int alignmentPattern = alignmentStep==0 ? AllAligned
                        : alignmentStep==2 ? EvenAligned
                        : FirstAligned;
@@ -423,17 +423,17 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
     skipColumns = std::min(skipColumns,rhs.size());
     // note that the skiped columns are processed later.
   }
-
-  int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce;
+  int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
   {
-    Scalar tmp0  = rhs[i], tmp1 = rhs[i+1], tmp2 = rhs[i+2], tmp3 = rhs[i+3];
-    Packet ptmp0 = ei_pset1(tmp0), ptmp1 = ei_pset1(tmp1), ptmp2 = ei_pset1(tmp2), ptmp3 = ei_pset1(tmp3);
-    int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride;
+    Packet ptmp0 = ei_pset1(rhs[i]),   ptmp1 = ei_pset1(rhs[i+1]),
+           ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+3]);
+    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+1)*lhsStride,
+                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
 
     // process initial unaligned coeffs
     for (int j=0; j<alignedStart; j++)
-      res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
+      res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
 
     if (alignedSize>alignedStart)
     {
@@ -448,14 +448,29 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
             _EIGEN_ACCUMULATE_PACKETS(,u,,);
           break;
         case FirstAligned:
-          if (peels>1)
+          if(peels>1)
+          {
+            // NOTE peeling with two _EIGEN_ACCUMULATE_PACKETS() is much less efficient
+            // than the following code
+            asm("#mybegin");
+            Packet A00, A01, A02, A03, A10, A11, A12, A13;
             for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
             {
-              _EIGEN_ACCUMULATE_PACKETS(,u,u,);
-              _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize);
-              if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize);
-              if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize);
+              A01 = ei_ploadu(&lhs1[j]);          A11 = ei_ploadu(&lhs1[j+PacketSize]);
+              A02 = ei_ploadu(&lhs2[j]);          A12 = ei_ploadu(&lhs2[j+PacketSize]);
+              A00 = ei_pload (&lhs0[j]);          A10 = ei_pload (&lhs0[j+PacketSize]);
+
+              A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j]));
+              A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize]));
+
+              A00 = ei_pmadd(ptmp1, A01, A00);    A10 = ei_pmadd(ptmp1, A11, A10);
+              A03 = ei_ploadu(&lhs3[j]);          A13 = ei_ploadu(&lhs3[j+PacketSize]);
+              A00 = ei_pmadd(ptmp2, A02, A00);    A10 = ei_pmadd(ptmp2, A12, A10);
+              A00 = ei_pmadd(ptmp3, A03, A00);    A10 = ei_pmadd(ptmp3, A13, A10);
+              ei_pstore(&res[j],A00);             ei_pstore(&res[j+PacketSize],A10);
             }
+            asm("#myend");
+          }
           for (int j = peeledSize; j<alignedSize; j+=PacketSize)
             _EIGEN_ACCUMULATE_PACKETS(,u,u,);
           break;
@@ -468,7 +483,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
 
     // process remaining coeffs
     for (int j=alignedSize; j<size; j++)
-      res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
+      res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
   }
 
   // process remaining first and last columns (at most columnsAtOnce-1)
@@ -476,26 +491,25 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   int start = columnBound;
   do
   {
-    for (int i=columnBound; i<end; i++)
+    for (int i=start; i<end; i++)
     {
-      Scalar tmp0 = rhs[i];
-      Packet ptmp0 = ei_pset1(tmp0);
-      int iN0 = i*lhsStride;
+      Packet ptmp0 = ei_pset1(rhs[i]);
+      const Scalar* lhs0 = lhs + i*lhsStride;
       // process first unaligned result's coeffs
       for (int j=0; j<alignedStart; j++)
-        res[j] += tmp0 * lhs[j+iN0];
+        res[j] += ei_pfirst(ptmp0) * lhs0[j];
 
       // process aligned result's coeffs
-      if ((iN0 % PacketSize) == 0)
+      if (((i*lhsStride) % PacketSize+alignedStart) == 0)
         for (int j = alignedStart;j<alignedSize;j+=PacketSize)
-          ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j])));
+          ei_pstore(&res[j], ei_pmadd(ptmp0,ei_pload(&lhs0[j]),ei_pload(&res[j])));
       else
         for (int j = alignedStart;j<alignedSize;j+=PacketSize)
-          ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+          ei_pstore(&res[j], ei_pmadd(ptmp0,ei_ploadu(&lhs0[j]),ei_pload(&res[j])));
 
       // process remaining scalars
       for (int j=alignedSize; j<size; j++)
-        res[j] += tmp0 * lhs[j+iN0];
+        res[j] += ei_pfirst(ptmp0) * lhs0[j];
     }
     if (skipColumns)
     {
@@ -507,10 +521,11 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
       break;
   } while(true);
   asm("#end matrix_vector_product");
-
   #undef _EIGEN_ACCUMULATE_PACKETS
 }
 
+
+// TODO add peeling to mask unaligned load/stores
 template<typename Scalar, typename ResType>
 EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
   const Scalar* lhs, int lhsStride,
@@ -523,10 +538,10 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
 
   #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) {\
     Packet b = ei_pload(&rhs[j]); \
-    ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload##A0 (&lhs[j+iN0]))); \
-    ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_pload##A13(&lhs[j+iN1]))); \
-    ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_pload##A2 (&lhs[j+iN2]))); \
-    ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_pload##A13(&lhs[j+iN3]))); }
+    ptmp0 = ei_pmadd(b, ei_pload##A0 (&lhs0[j]), ptmp0); \
+    ptmp1 = ei_pmadd(b, ei_pload##A13(&lhs1[j]), ptmp1); \
+    ptmp2 = ei_pmadd(b, ei_pload##A2 (&lhs2[j]), ptmp2); \
+    ptmp3 = ei_pmadd(b, ei_pload##A13(&lhs3[j]), ptmp3); }
 
   asm("#begin matrix_vector_product");
   typedef typename ei_packet_traits<Scalar>::type Packet;
@@ -534,9 +549,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
 
   enum { AllAligned, EvenAligned, FirstAligned, NoneAligned };
   const int rowsAtOnce = 4;
-  const int peels = 2;
+//   const int peels = 2;
   const int PacketAlignedMask = PacketSize-1;
-  const int PeelAlignedMask = PacketSize*peels-1;
+//   const int PeelAlignedMask = PacketSize*peels-1;
   const bool Vectorized = sizeof(Packet) != sizeof(Scalar);
   const int size = rhsSize;
 
@@ -546,9 +561,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
                          ? std::min<int>( (PacketSize - ((size_t(rhs)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
                          : 0;
   const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
-  const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
+  //const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
 
-  const int alignmentStep = lhsStride % PacketSize;
+  const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask;
   int alignmentPattern = alignmentStep==0 ? AllAligned
                        : alignmentStep==2 ? EvenAligned
                        : FirstAligned;
@@ -568,19 +583,20 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     skipRows = std::min(skipRows,res.size());
     // note that the skiped columns are processed later.
   }
-
-  int rowBound = (res.size()/rowsAtOnce)*rowsAtOnce;
+  
+  int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
     Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
     Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
-    int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride;
+    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+1)*lhsStride,
+                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
 
     // process initial unaligned coeffs
     for (int j=0; j<alignedStart; j++)
     {
       Scalar b = rhs[j];
-      tmp0 += b * lhs[j+iN0]; tmp1 += b * lhs[j+iN1]; tmp2 += b * lhs[j+iN2]; tmp3 += b * lhs[j+iN3];
+      tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
     }
 
     if (alignedSize>alignedStart)
@@ -614,9 +630,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
     for (int j=alignedSize; j<size; j++)
     {
       Scalar b = rhs[j];
-      tmp0 += b * lhs[j+iN0]; tmp1 += b * lhs[j+iN1]; tmp2 += b * lhs[j+iN2]; tmp3 += b * lhs[j+iN3];
+      tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
     }
-    res[i] = tmp0; res[i+1] = tmp1; res[i+2] = tmp2; res[i+3] = tmp3;
+    res[i] += tmp0; res[i+1] += tmp1; res[i+2] += tmp2; res[i+3] += tmp3;
   }
 
   // process remaining first and last rows (at most columnsAtOnce-1)
@@ -624,30 +640,30 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
   int start = rowBound;
   do
   {
-    for (int i=rowBound; i<end; i++)
+    for (int i=start; i<end; i++)
     {
       Scalar tmp0 = Scalar(0);
       Packet ptmp0 = ei_pset1(tmp0);
-      int iN0 = i*lhsStride;
+      const Scalar* lhs0 = lhs + i*lhsStride;
       // process first unaligned result's coeffs
       for (int j=0; j<alignedStart; j++)
-        tmp0 += rhs[j] * lhs[j+iN0];
+        tmp0 += rhs[j] * lhs0[j];
 
       if (alignedSize>alignedStart)
       {
         // process aligned rhs coeffs
-        if (iN0 % PacketSize==0)
+        if ((i*lhsStride+alignedStart) % PacketSize==0)
           for (int j = alignedStart;j<alignedSize;j+=PacketSize)
-            ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&rhs[j]), ei_pload(&lhs[j+iN0])));
+            ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_pload(&lhs0[j]), ptmp0);
         else
           for (int j = alignedStart;j<alignedSize;j+=PacketSize)
-            ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&rhs[j]), ei_ploadu(&lhs[j+iN0])));
+            ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_ploadu(&lhs0[j]), ptmp0);
         tmp0 += ei_predux(ptmp0);
       }
 
       // process remaining scalars
       for (int j=alignedSize; j<size; j++)
-        tmp0 += rhs[j] * lhs[j+iN0];
+        tmp0 += rhs[j] * lhs0[j];
       res[i] += tmp0;
     }
     if (skipRows)
diff --git a/Eigen/src/Core/DummyPacketMath.h b/Eigen/src/Core/DummyPacketMath.h
index 1b3bae34f..1a4ecbfa0 100644
--- a/Eigen/src/Core/DummyPacketMath.h
+++ b/Eigen/src/Core/DummyPacketMath.h
@@ -117,7 +117,7 @@ template <typename Scalar, typename Packet, int LoadMode> inline void ei_pstoret
   if(LoadMode == Aligned)
     ei_pstore(to, from);
   else
-    ei_pstoreu(to, from);  
+    ei_pstoreu(to, from);
 }
 
 #endif // EIGEN_DUMMY_PACKET_MATH_H
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 92ac5f20b..13664eebc 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -66,6 +66,8 @@ template<typename MatrixType, int Alignment> class Map
     inline int rows() const { return m_rows.value(); }
     inline int cols() const { return m_cols.value(); }
 
+    inline int stride() const { return this->innerSize(); }
+
     inline const Scalar& coeff(int row, int col) const
     {
       if(Flags & RowMajorBit)
@@ -156,40 +158,6 @@ template<typename MatrixType, int Alignment> class Map
     const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
 };
 
-/** Constructor copying an existing array of data. Only useful for dynamic-size matrices:
-  * for fixed-size matrices, it is redundant to pass the \a rows and \a cols parameters.
-  * \param data The array of data to copy
-  * \param rows The number of rows of the matrix to construct
-  * \param cols The number of columns of the matrix to construct
-  *
-  * \sa Matrix(const Scalar *), Matrix::map(const Scalar *, int, int)
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags>
-inline Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>
-  ::Matrix(const Scalar *data, int rows, int cols)
-  : m_storage(rows*cols, rows, cols)
-{
-  *this = Map<Matrix>(data, rows, cols);
-}
-
-/** Constructor copying an existing array of data. Only useful for dynamic-size vectors:
-  * for fixed-size vectors, it is redundant to pass the \a size parameter.
-  *
-  * \only_for_vectors
-  *
-  * \param data The array of data to copy
-  * \param size The size of the vector to construct
-  *
-  * \sa Matrix(const Scalar *), Matrix::map(const Scalar *, int)
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags>
-inline Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags>
-  ::Matrix(const Scalar *data, int size)
-  : m_storage(size, RowsAtCompileTime == 1 ? 1 : size, ColsAtCompileTime == 1 ? 1 : size)
-{
-  *this = Map<Matrix>(data, size);
-}
-
 /** Constructor copying an existing array of data.
   * Only for fixed-size matrices and vectors.
   * \param data The array of data to copy
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 379b5e1f4..569d7207c 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -319,8 +319,7 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
       m_storage.data()[2] = z;
       m_storage.data()[3] = w;
     }
-    Matrix(const Scalar *data, int rows, int cols);
-    Matrix(const Scalar *data, int size);
+
     explicit Matrix(const Scalar *data);
 
     /** Constructor copying the value of the expression \a other */
diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h
index 53faf884e..2b13018bf 100644
--- a/Eigen/src/Core/MatrixStorage.h
+++ b/Eigen/src/Core/MatrixStorage.h
@@ -161,7 +161,7 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic>
   public:
     inline ei_matrix_storage(int size, int rows, int cols)
       : m_data(ei_aligned_malloc<T>(size)), m_rows(rows), m_cols(cols) {}
-    inline ~ei_matrix_storage() { delete[] m_data; }
+    inline ~ei_matrix_storage() { ei_aligned_free(m_data); }
     inline void swap(ei_matrix_storage& other)
     { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
     inline int rows(void) const {return m_rows;}
@@ -187,7 +187,7 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam
     int m_cols;
   public:
     inline ei_matrix_storage(int size, int, int cols) : m_data(ei_aligned_malloc<T>(size)), m_cols(cols) {}
-    inline ~ei_matrix_storage() { delete[] m_data; }
+    inline ~ei_matrix_storage() { ei_aligned_free(m_data); }
     inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
     inline static int rows(void) {return _Rows;}
     inline int cols(void) const {return m_cols;}
@@ -211,7 +211,7 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co
     int m_rows;
   public:
     inline ei_matrix_storage(int size, int rows, int) : m_data(ei_aligned_malloc<T>(size)), m_rows(rows) {}
-    inline ~ei_matrix_storage() { delete[] m_data; }
+    inline ~ei_matrix_storage() { ei_aligned_free(m_data); }
     inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
     inline int rows(void) const {return m_rows;}
     inline static int cols(void) {return _Cols;}
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 223da4d06..831611adf 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -94,8 +94,8 @@ template<typename Lhs, typename Rhs> struct ei_product_mode
           : Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
             && ( Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               || Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD )
-            && (!(Rhs::IsVectorAtCompileTime && (Lhs::Flags&RowMajorBit)  && (!Lhs::Flags&DirectAccessBit)))
-            && (!(Lhs::IsVectorAtCompileTime && (!Rhs::Flags&RowMajorBit) && (!Rhs::Flags&DirectAccessBit)))
+            && (!(Rhs::IsVectorAtCompileTime && (Lhs::Flags&RowMajorBit)  && (!(Lhs::Flags&DirectAccessBit))))
+            && (!(Lhs::IsVectorAtCompileTime && (!(Rhs::Flags&RowMajorBit)) && (!(Rhs::Flags&DirectAccessBit))))
           ? CacheFriendlyProduct
           : NormalProduct };
 };
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index a30011dea..114ed49e2 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -189,5 +189,14 @@ inline __m128i ei_preduxp(const __m128i* vecs)
   return _mm_add_epi32(tmp0, tmp2);
 }
 
+#if (defined __GNUC__)
+template <> inline __m128 ei_pmadd(const __m128&  a, const __m128&  b, const __m128&  c)
+{
+  __m128 res = b;
+  asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
+  return res;
+}
+#endif
+
 #endif // EIGEN_PACKET_MATH_SSE_H
 
diff --git a/test/map.cpp b/test/map.cpp
index 1723a047c..39e40af15 100644
--- a/test/map.cpp
+++ b/test/map.cpp
@@ -38,7 +38,6 @@ template<typename VectorType> void tmap(const VectorType& m)
   VectorType ma1 = Map<VectorType>(array1, size);
   VectorType ma2 = Map<VectorType, Aligned>(array2, size);
   VERIFY_IS_APPROX(ma1, ma2);
-  VERIFY_IS_APPROX(ma1, VectorType(array2, size));
   ei_aligned_free(array1);
   ei_aligned_free(array2);
 }
diff --git a/test/product.cpp b/test/product.cpp
index 50ec64d4d..48999119f 100644
--- a/test/product.cpp
+++ b/test/product.cpp
@@ -23,12 +23,14 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
 #include "main.h"
+#include <Eigen/Array>
 #include <Eigen/QR>
 
 template<typename Derived1, typename Derived2>
 bool areNotApprox(const MatrixBase<Derived1>& m1, const MatrixBase<Derived2>& m2, typename Derived1::RealScalar epsilon = precision<typename Derived1::RealScalar>())
 {
-  return !((m1-m2).matrixNorm() < epsilon * std::max(m1.matrixNorm(), m2.matrixNorm()));
+  return !((m1-m2).cwise().abs2().maxCoeff() < epsilon * epsilon
+                          * std::max(m1.cwise().abs2().maxCoeff(), m2.cwise().abs2().maxCoeff()));
 }
 
 template<typename MatrixType> void product(const MatrixType& m)