From d41577819bddb5ca734acc3ba0697646475dc786 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <jacob.benoit.1@gmail.com>
Date: Mon, 5 Oct 2009 10:11:11 -0400
Subject: [PATCH] we were already aligning to a 16-byte boundary fixed-size
 objects whose size is a multiple of 16 bytes; now we also align to an 8-byte
 boundary fixed-size objects whose size is a multiple of 8 bytes. That's only
 useful for now for double, not e.g. for Vector2f, but that didn't seem to
 hurt. Am I missing something? Do you prefer that we don't align Vector2f at
 all? Also, improvements in test_unalignedassert.

---
 Eigen/src/Core/MatrixStorage.h           | 52 ++++++++-----
 Eigen/src/Core/arch/AltiVec/PacketMath.h |  6 +-
 Eigen/src/Core/arch/SSE/PacketMath.h     |  6 +-
 Eigen/src/Core/util/Macros.h             | 16 ++--
 test/packetmath.cpp                      | 14 ++--
 test/unalignedassert.cpp                 | 94 +++++++++++++-----------
 6 files changed, 108 insertions(+), 80 deletions(-)
diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h
index f67095d0c..654fdf5e6 100644
--- a/Eigen/src/Core/MatrixStorage.h
+++ b/Eigen/src/Core/MatrixStorage.h
@@ -29,32 +29,48 @@
 struct ei_constructor_without_unaligned_array_assert {};
 
 /** \internal
-  * Static array automatically aligned if the total byte size is a multiple of 16 and the matrix options require auto alignment
+  * Static array. If the MatrixOptions require auto-alignment, the array will be automatically aligned:
+  *  - to a 16-byte boundary, if the total size is a multiple of 16 bytes;
+  *  - or else to an 8-byte boundary, if the total size is a multiple of 8 bytes.
   */
 template <typename T, int Size, int MatrixOptions,
-          bool Align = (!(MatrixOptions&DontAlign)) && (((Size*sizeof(T))&0xf)==0)
-> struct ei_matrix_array
-{
-  EIGEN_ALIGN_128 T array[Size];
-
-  ei_matrix_array()
-  {
-    #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-    ei_assert((reinterpret_cast<size_t>(array) & 0xf) == 0
-              && "this assertion is explained here: http://eigen.tuxfamily.org/dox/UnalignedArrayAssert.html  **** READ THIS WEB PAGE !!! ****");
-    #endif
-  }
-
-  ei_matrix_array(ei_constructor_without_unaligned_array_assert) {}
-};
-
-template <typename T, int Size, int MatrixOptions> struct ei_matrix_array<T,Size,MatrixOptions,false>
+          int Alignment = (MatrixOptions&DontAlign) ? 0
+                        : (((Size*sizeof(T))%16)==0) ? 16
+                        : (((Size*sizeof(T))%8)==0) ? 8
+                        : 0 >
+struct ei_matrix_array
 {
   T array[Size];
   ei_matrix_array() {}
   ei_matrix_array(ei_constructor_without_unaligned_array_assert) {}
 };
 
+#ifdef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) /* check compiled out: expands to nothing */
+#else
+  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) /* asserts that (address of 'array' & sizemask) == 0; sizemask is parenthesised so any expression is safe */ \
+    ei_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
+              && "this assertion is explained here: " \
+              "http://eigen.tuxfamily.org/dox/UnalignedArrayAssert.html" \
+              " **** READ THIS WEB PAGE !!! ****");
+#endif // EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+
+template <typename T, int Size, int MatrixOptions>
+struct ei_matrix_array<T, Size, MatrixOptions, 16> // specialization selected when Alignment == 16
+{
+  EIGEN_ALIGN16 T array[Size]; // storage declared with the 16-byte alignment attribute
+  ei_matrix_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf) } // checks the low 4 address bits of array; nothing if the assert is disabled
+  ei_matrix_array(ei_constructor_without_unaligned_array_assert) {} // tag overload: constructs without the alignment check
+};
+
+template <typename T, int Size, int MatrixOptions>
+struct ei_matrix_array<T, Size, MatrixOptions, 8> // specialization selected when Alignment == 8
+{
+  EIGEN_ALIGN8 T array[Size]; // storage declared with the 8-byte alignment attribute
+  ei_matrix_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0x7) } // checks the low 3 address bits of array; nothing if the assert is disabled
+  ei_matrix_array(ei_constructor_without_unaligned_array_assert) {} // tag overload: constructs without the alignment check
+};
+
 /** \internal
   *
   * \class ei_matrix_storage
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index a9c16200e..1526a4b97 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -265,14 +265,14 @@ template<> inline void ei_pstoreu(int*    to , const v4i&    from )
 
 template<> inline float  ei_pfirst(const v4f&  a)
 {
-  float EIGEN_ALIGN_128 af[4];
+  float EIGEN_ALIGN16 af[4];
   vec_st(a, 0, af);
   return af[0];
 }
 
 template<> inline int    ei_pfirst(const v4i&  a)
 {
-  int EIGEN_ALIGN_128 ai[4];
+  int EIGEN_ALIGN16 ai[4];
   vec_st(a, 0, ai);
   return ai[0];
 }
@@ -373,7 +373,7 @@ inline float ei_predux_mul(const v4f& a)
 
 inline int ei_predux_mul(const v4i& a)
 {
-  EIGEN_ALIGN_128 int aux[4];
+  EIGEN_ALIGN16 int aux[4];
   ei_pstore(aux, a);
   return aux[0] * aux[1] * aux[2] * aux[3];
 }
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index ddc7b4aaf..eb1c2d311 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -359,7 +359,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_mul<Packet4i>(const Packet4i& a)
   // after some experiments, it is seems this is the fastest way to implement it
   // for GCC (eg., reusing ei_pmul is very slow !)
   // TODO try to call _mm_mul_epu32 directly
-  EIGEN_ALIGN_128 int aux[4];
+  EIGEN_ALIGN16 int aux[4];
   ei_pstore(aux, a);
   return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
 }
@@ -378,7 +378,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_min<Packet4i>(const Packet4i& a)
 {
   // after some experiments, it is seems this is the fastest way to implement it
   // for GCC (eg., it does not like using std::min after the ei_pstore !!)
-  EIGEN_ALIGN_128 int aux[4];
+  EIGEN_ALIGN16 int aux[4];
   ei_pstore(aux, a);
   register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
   register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
@@ -399,7 +399,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a)
 {
   // after some experiments, it is seems this is the fastest way to implement it
   // for GCC (eg., it does not like using std::min after the ei_pstore !!)
-  EIGEN_ALIGN_128 int aux[4];
+  EIGEN_ALIGN16 int aux[4];
   ei_pstore(aux, a);
   register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
   register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 71962bcae..fb149e50a 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -202,25 +202,29 @@ using Eigen::ei_cos;
 #define EIGEN_ASM_COMMENT(X)
 #endif
 
-/* EIGEN_ALIGN_128 forces data to be 16-byte aligned, EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
  * so that vectorization doesn't affect binary compatibility.
  *
  * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
  * vectorized and non-vectorized code.
  */
 #if !EIGEN_ALIGN
-  #define EIGEN_ALIGN_128
+  #define EIGEN_ALIGN_TO_BOUNDARY(n)
 #elif (defined __GNUC__)
-  #define EIGEN_ALIGN_128 __attribute__((aligned(16)))
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #elif (defined _MSC_VER)
-  #define EIGEN_ALIGN_128 __declspec(align(16))
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
 #elif (defined __SUNPRO_CC)
   // FIXME not sure about this one:
-  #define EIGEN_ALIGN_128 __attribute__((aligned(16)))
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #else
-  #error Please tell me what is the equivalent of __attribute__((aligned(16))) for your compiler
+  #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
 #endif
 
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
+
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
   #define EIGEN_RESTRICT
 #endif
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index d86d40d68..1745ae5c6 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -99,10 +99,10 @@ template<typename Scalar> void packetmath()
   const int PacketSize = ei_packet_traits<Scalar>::size;
 
   const int size = PacketSize*4;
-  EIGEN_ALIGN_128 Scalar data1[ei_packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN_128 Scalar data2[ei_packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN_128 Packet packets[PacketSize*2];
-  EIGEN_ALIGN_128 Scalar ref[ei_packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN16 Scalar data1[ei_packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN16 Scalar data2[ei_packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN16 Packet packets[PacketSize*2];
+  EIGEN_ALIGN16 Scalar ref[ei_packet_traits<Scalar>::size*4];
   for (int i=0; i<size; ++i)
   {
     data1[i] = ei_random<Scalar>();
@@ -202,9 +202,9 @@ template<typename Scalar> void packetmath_real()
   const int PacketSize = ei_packet_traits<Scalar>::size;
 
   const int size = PacketSize*4;
-  EIGEN_ALIGN_128 Scalar data1[ei_packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN_128 Scalar data2[ei_packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN_128 Scalar ref[ei_packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN16 Scalar data1[ei_packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN16 Scalar data2[ei_packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN16 Scalar ref[ei_packet_traits<Scalar>::size*4];
   
   for (int i=0; i<size; ++i)
   {
diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp
index ade1ab26e..8acc90158 100644
--- a/test/unalignedassert.cpp
+++ b/test/unalignedassert.cpp
@@ -24,52 +24,38 @@
 
 #include "main.h"
 
-struct Good1
+struct TestNew1
 {
   MatrixXd m; // good: m will allocate its own array, taking care of alignment.
-  Good1() : m(20,20) {}
+  TestNew1() : m(20,20) {}
 };
 
-struct Good2
+struct TestNew2
 {
-  Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be aligned
+  Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned,
+              // 8-byte alignment is good enough here, which we'll get automatically
 };
 
-struct Good3
+struct TestNew3
 {
-  Vector2f m; // good: same reason
+  Vector2f m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned
 };
 
-struct Bad4
-{
-  Vector2d m; // bad: sizeof(m)%16==0 so alignment is required
-};
-
-struct Bad5
-{
-  Matrix<float, 2, 6> m; // bad: same reason
-};
-
-struct Bad6
-{
-  Matrix<double, 3, 4> m; // bad: same reason
-};
-
-struct Good7
+struct TestNew4
 {
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
   Vector2d m;
   float f; // make the struct have sizeof%16!=0 to make it a little more tricky when we allow an array of 2 such objects
 };
 
-struct Good8
+struct TestNew5
 {
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  float f; // try the f at first -- the EIGEN_ALIGN_128 attribute of m should make that still work
+  float f; // try the f at first -- the EIGEN_ALIGN16 attribute of m should make that still work
   Matrix4f m;
 };
 
-struct Good9
+struct TestNew6
 {
   Matrix<float,2,2,DontAlign> m; // good: no alignment requested
   float f;
@@ -94,34 +80,58 @@ void check_unalignedassert_good()
 
-#if EIGEN_ALIGN
+// Placement-constructs a T inside a local buffer, at an address lying
+// 'boundary' bytes past a 16-byte aligned address, then destroys it.
+// Defined unconditionally: unalignedassert() also calls it outside of its
+// EIGEN_ALIGN-guarded section, so it must exist when EIGEN_ALIGN is 0.
 template<typename T>
-void check_unalignedassert_bad()
+void construct_at_boundary(int boundary)
 {
-  float buf[sizeof(T)+16];
-  float *unaligned = buf;
-  while((reinterpret_cast<size_t>(unaligned)&0xf)==0) ++unaligned; // make sure unaligned is really unaligned
-  T *x = ::new(static_cast<void*>(unaligned)) T;
+  char buf[sizeof(T)+256];
+  size_t _buf = reinterpret_cast<size_t>(buf);
+  _buf += (16 - (_buf % 16)); // make 16-byte aligned
+  _buf += boundary; // make exact boundary-aligned
+  T *x = ::new(reinterpret_cast<void*>(_buf)) T;
   x->~T();
 }
-#endif
 
 void unalignedassert()
 {
-  check_unalignedassert_good<Good1>();
-  check_unalignedassert_good<Good2>();
-  check_unalignedassert_good<Good3>();
-#if EIGEN_ALIGN
-  VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad4>());
-  VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad5>());
-  VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad6>());
-#endif
+  construct_at_boundary<Vector2f>(8);
+  construct_at_boundary<Vector3f>(4);
+  construct_at_boundary<Vector4f>(16);
+  construct_at_boundary<Matrix2f>(16);
+  construct_at_boundary<Matrix3f>(4);
+  construct_at_boundary<Matrix4f>(16);
+  
+  construct_at_boundary<Vector2d>(16);
+  construct_at_boundary<Vector3d>(8);
+  construct_at_boundary<Vector4d>(16);
+  construct_at_boundary<Matrix2d>(16);
+  construct_at_boundary<Matrix3d>(8);
+  construct_at_boundary<Matrix4d>(16);
+  
+  check_unalignedassert_good<TestNew1>();
+  check_unalignedassert_good<TestNew2>();
+  check_unalignedassert_good<TestNew3>();
 
-  check_unalignedassert_good<Good7>();
-  check_unalignedassert_good<Good8>();
-  check_unalignedassert_good<Good9>();
+  check_unalignedassert_good<TestNew4>();
+  check_unalignedassert_good<TestNew5>();
+  check_unalignedassert_good<TestNew6>();
   check_unalignedassert_good<Depends<true> >();
-
+  
 #if EIGEN_ALIGN
-  VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Depends<false> >());
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2f>(4));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(4));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2f>(4));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2f>(8));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(4));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(8));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector3d>(4));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(8));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2d>(8));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix3d>(4));
+  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4d>(8));
 #endif
 }