bump

backport from tip : actually stop on compile failure
Fix CoeffReadCost in Part: it must account for the cost of the
2026-04-10 11:34:33 +08:00 · 2010-01-10 11:30:10 -05:00 · 2010-01-06 17:17:40 +01:00 · 2010-01-02 13:04:04 -05:00 · 2010-01-02 12:54:55 -05:00 · 2010-01-02 12:45:49 -05:00
15 changed files with 290 additions and 123 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ set(INCLUDE_INSTALL_DIR
    "The directory where we install the header files"
    FORCE)

-set(EIGEN_VERSION_NUMBER "2.0.10")
+set(EIGEN_VERSION_NUMBER "2.0.11")
 set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER}")

 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
--- a/Eigen/src/Core/Part.h
+++ b/Eigen/src/Core/Part.h
@@ -50,7 +50,7 @@ struct ei_traits<Part<MatrixType, Mode> > : ei_traits<MatrixType>
  typedef typename ei_unref<MatrixTypeNested>::type _MatrixTypeNested;
  enum {
    Flags = (_MatrixTypeNested::Flags & (HereditaryBits) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))) | Mode,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+    CoeffReadCost = _MatrixTypeNested::CoeffReadCost + ConditionalJumpCost
  };
 };

--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -30,7 +30,7 @@

 #define EIGEN_WORLD_VERSION 2
 #define EIGEN_MAJOR_VERSION 0
-#define EIGEN_MINOR_VERSION 10
+#define EIGEN_MINOR_VERSION 11

 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -209,18 +209,47 @@ template<typename T, bool Align> inline void ei_conditional_aligned_delete(T *pt
  ei_conditional_aligned_free<Align>(ptr);
 }

-/** \internal \returns the number of elements which have to be skipped such that data are 16 bytes aligned */
-template<typename Scalar>
-inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
+/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
+  *
+  * \param array the address of the start of the array
+  * \param size the size of the array
+  *
+  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
+  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+  * packet size for the given scalar type is 1, then everything is considered well-aligned.
+  *
+  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
+  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
+  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * example with Scalar=double on certain 32-bit platforms, see bug #79.
+  *
+  * There is also the variant ei_first_aligned(const MatrixBase&, Integer) defined in Coeffs.h.
+  */
+template<typename Scalar, typename Integer>
+inline static Integer ei_alignmentOffset(const Scalar* array, Integer size)
 {
  typedef typename ei_packet_traits<Scalar>::type Packet;
-  const int PacketSize = ei_packet_traits<Scalar>::size;
-  const int PacketAlignedMask = PacketSize-1;
-  const bool Vectorized = PacketSize>1;
-  return Vectorized
-          ? std::min<int>( (PacketSize - (int((size_t(ptr)/sizeof(Scalar))) & PacketAlignedMask))
-                           & PacketAlignedMask, maxOffset)
-          : 0;
+  enum { PacketSize = ei_packet_traits<Scalar>::size,
+         PacketAlignedMask = PacketSize-1
+  };
+  
+  if(PacketSize==1)
+  {
+    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
+    // of the array have the same aligment.
+    return 0;
+  }
+  else if(size_t(array) & (sizeof(Scalar)-1))
+  {
+    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+    // Consequently, no element of the array is well aligned.
+    return size;
+  }
+  else
+  {
+    return std::min<Integer>( (PacketSize - (Integer((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
+                           & PacketAlignedMask, size);
+  }
 }

 /** \internal
--- a/Eigen/src/Geometry/Hyperplane.h
+++ b/Eigen/src/Geometry/Hyperplane.h
@@ -52,9 +52,9 @@ public:
  typedef _Scalar Scalar;
  typedef typename NumTraits<Scalar>::Real RealScalar;
  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime==Dynamic
+  typedef Matrix<Scalar,int(AmbientDimAtCompileTime)==Dynamic
                        ? Dynamic
-                        : AmbientDimAtCompileTime+1,1> Coefficients;
+                        : int(AmbientDimAtCompileTime)+1,1> Coefficients;
  typedef Block<Coefficients,AmbientDimAtCompileTime,1> NormalReturnType;

  /** Default constructor without initialization */
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -450,22 +450,31 @@ inline Scalar Quaternion<Scalar>::angularDistance(const Quaternion& other) const
 template <typename Scalar>
 Quaternion<Scalar> Quaternion<Scalar>::slerp(Scalar t, const Quaternion& other) const
 {
-  static const Scalar one = Scalar(1) - precision<Scalar>();
+  static const Scalar one = Scalar(1) - machine_epsilon<Scalar>();
  Scalar d = this->dot(other);
  Scalar absD = ei_abs(d);
+
+  Scalar scale0;
+  Scalar scale1;
+
  if (absD>=one)
-    return *this;
+  {
+    scale0 = Scalar(1) - t;
+    scale1 = t;
+  }
+  else
+  {
+    // theta is the angle between the 2 quaternions
+    Scalar theta = std::acos(absD);
+    Scalar sinTheta = ei_sin(theta);

-  // theta is the angle between the 2 quaternions
-  Scalar theta = std::acos(absD);
-  Scalar sinTheta = ei_sin(theta);
+    scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta;
+    scale1 = ei_sin( ( t * theta) ) / sinTheta;
+    if (d<0)
+      scale1 = -scale1;
+  }

-  Scalar scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta;
-  Scalar scale1 = ei_sin( ( t * theta) ) / sinTheta;
-  if (d<0)
-    scale1 = -scale1;
-
-  return Quaternion(scale0 * m_coeffs + scale1 * other.m_coeffs);
+  return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
 }

 // set from a rotation matrix
--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/Inverse.h
@@ -29,8 +29,8 @@
 *** Part 1 : optimized implementations for fixed-size 2,3,4 cases ***
 ********************************************************************/

-template<typename MatrixType>
-void ei_compute_inverse_in_size2_case(const MatrixType& matrix, MatrixType* result)
+template<typename XprType, typename MatrixType>
+void ei_compute_inverse_in_size2_case(const XprType& matrix, MatrixType* result)
 {
  typedef typename MatrixType::Scalar Scalar;
  const Scalar invdet = Scalar(1) / matrix.determinant();
@@ -75,99 +75,155 @@ void ei_compute_inverse_in_size3_case(const MatrixType& matrix, MatrixType* resu
  result->coeffRef(2, 2) = matrix.minor(2,2).determinant() * invdet;
 }

-template<typename MatrixType>
-bool ei_compute_inverse_in_size4_case_helper(const MatrixType& matrix, MatrixType* result)
+template<typename MatrixType, typename Scalar = typename MatrixType::Scalar>
+struct ei_compute_inverse_in_size4_case
 {
-  /* Let's split M into four 2x2 blocks:
-    * (P Q)
-    * (R S)
-    * If P is invertible, with inverse denoted by P_inverse, and if
-    * (S - R*P_inverse*Q) is also invertible, then the inverse of M is
-    * (P' Q')
-    * (R' S')
-    * where
-    * S' = (S - R*P_inverse*Q)^(-1)
-    * P' = P1 + (P1*Q) * S' *(R*P_inverse)
-    * Q' = -(P_inverse*Q) * S'
-    * R' = -S' * (R*P_inverse)
-    */
-  typedef Block<MatrixType,2,2> XprBlock22;
-  typedef typename MatrixBase<XprBlock22>::PlainMatrixType Block22;
-  Block22 P_inverse;
-  if(ei_compute_inverse_in_size2_case_with_check(matrix.template block<2,2>(0,0), &P_inverse))
+  static void run(const MatrixType& matrix, MatrixType& result)
  {
-    const Block22 Q = matrix.template block<2,2>(0,2);
-    const Block22 P_inverse_times_Q = P_inverse * Q;
-    const XprBlock22 R = matrix.template block<2,2>(2,0);
-    const Block22 R_times_P_inverse = R * P_inverse;
-    const Block22 R_times_P_inverse_times_Q = R_times_P_inverse * Q;
-    const XprBlock22 S = matrix.template block<2,2>(2,2);
-    const Block22 X = S - R_times_P_inverse_times_Q;
-    Block22 Y;
-    ei_compute_inverse_in_size2_case(X, &Y);
-    result->template block<2,2>(2,2) = Y;
-    result->template block<2,2>(2,0) = - Y * R_times_P_inverse;
-    const Block22 Z = P_inverse_times_Q * Y;
-    result->template block<2,2>(0,2) = - Z;
-    result->template block<2,2>(0,0) = P_inverse + Z * R_times_P_inverse;
-    return true;
+    result.coeffRef(0,0) = matrix.minor(0,0).determinant();
+    result.coeffRef(1,0) = -matrix.minor(0,1).determinant();
+    result.coeffRef(2,0) = matrix.minor(0,2).determinant();
+    result.coeffRef(3,0) = -matrix.minor(0,3).determinant();
+    result.coeffRef(0,2) = matrix.minor(2,0).determinant();
+    result.coeffRef(1,2) = -matrix.minor(2,1).determinant();
+    result.coeffRef(2,2) = matrix.minor(2,2).determinant();
+    result.coeffRef(3,2) = -matrix.minor(2,3).determinant();
+    result.coeffRef(0,1) = -matrix.minor(1,0).determinant();
+    result.coeffRef(1,1) = matrix.minor(1,1).determinant();
+    result.coeffRef(2,1) = -matrix.minor(1,2).determinant();
+    result.coeffRef(3,1) = matrix.minor(1,3).determinant();
+    result.coeffRef(0,3) = -matrix.minor(3,0).determinant();
+    result.coeffRef(1,3) = matrix.minor(3,1).determinant();
+    result.coeffRef(2,3) = -matrix.minor(3,2).determinant();
+    result.coeffRef(3,3) = matrix.minor(3,3).determinant();
+    result /= (matrix.col(0).cwise()*result.row(0).transpose()).sum();
  }
-  else
-  {
-    return false;
-  }
-}
+};

+#ifdef EIGEN_VECTORIZE_SSE
+// The SSE code for the 4x4 float matrix inverse in this file comes from the file
+//   ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
+// its copyright information is:
+//   Copyright (C) 1999 Intel Corporation
+// See page ii of that document for legal stuff. Not being lawyers, we just assume
+// here that if Intel makes this document publically available, with source code
+// and detailed explanations, it's because they want their CPUs to be fed with
+// good code, and therefore they presumably don't mind us using it in Eigen.
 template<typename MatrixType>
-void ei_compute_inverse_in_size4_case(const MatrixType& _matrix, MatrixType* result)
+struct ei_compute_inverse_in_size4_case<MatrixType, float>
 {
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
+  static void run(const MatrixType& matrix, MatrixType& result)
+  {
+    // Variables (Streaming SIMD Extensions registers) which will contain cofactors and, later, the
+    // lines of the inverted matrix.
+    __m128 minor0, minor1, minor2, minor3;

-  // we will do row permutations on the matrix. This copy should have negligible cost.
-  // if not, consider working in-place on the matrix (const-cast it, but then undo the permutations
-  // to nevertheless honor constness)
-  typename MatrixType::PlainMatrixType matrix(_matrix);
+    // Variables which will contain the lines of the reference matrix and, later (after the transposition),
+    // the columns of the original matrix.
+    __m128 row0, row1, row2, row3;

-  // let's extract from the 2 first colums a 2x2 block whose determinant is as big as possible.
-  int good_row0, good_row1, good_i;
-  Matrix<RealScalar,6,1> absdet;
+    // Temporary variables and the variable that will contain the matrix determinant.
+    __m128 det, tmp1;

-  // any 2x2 block with determinant above this threshold will be considered good enough.
-  // The magic value 1e-1 here comes from experimentation. The bigger it is, the higher the precision,
-  // the slower the computation. This value 1e-1 gives precision almost as good as the brutal cofactors
-  // algorithm, both in average and in worst-case precision.
-  RealScalar d = (matrix.col(0).squaredNorm()+matrix.col(1).squaredNorm()) * RealScalar(1e-1);
-  #define ei_inv_size4_helper_macro(i,row0,row1) \
-  absdet[i] = ei_abs(matrix.coeff(row0,0)*matrix.coeff(row1,1) \
-                                - matrix.coeff(row0,1)*matrix.coeff(row1,0)); \
-  if(absdet[i] > d) { good_row0=row0; good_row1=row1; goto good; }
-  ei_inv_size4_helper_macro(0,0,1)
-  ei_inv_size4_helper_macro(1,0,2)
-  ei_inv_size4_helper_macro(2,0,3)
-  ei_inv_size4_helper_macro(3,1,2)
-  ei_inv_size4_helper_macro(4,1,3)
-  ei_inv_size4_helper_macro(5,2,3)
+    // Matrix transposition
+    const float *src = matrix.data();
+    tmp1  = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)src)), (__m64*)(src+ 4));
+    row1  = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+8))), (__m64*)(src+12));
+    row0  = _mm_shuffle_ps(tmp1, row1, 0x88);
+    row1  = _mm_shuffle_ps(row1, tmp1, 0xDD);
+    tmp1  = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+ 2))), (__m64*)(src+ 6));
+    row3  = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+10))), (__m64*)(src+14));
+    row2  = _mm_shuffle_ps(tmp1, row3, 0x88);
+    row3  = _mm_shuffle_ps(row3, tmp1, 0xDD);

-  // no 2x2 block has determinant bigger than the threshold. So just take the one that
-  // has the biggest determinant
-  absdet.maxCoeff(&good_i);
-  good_row0 = good_i <= 2 ? 0 : good_i <= 4 ? 1 : 2;
-  good_row1 = good_i <= 2 ? good_i+1 : good_i <= 4 ? good_i-1 : 3;

-  // now good_row0 and good_row1 are correctly set
-  good:
+    // Cofactors calculation. Because in the process of cofactor computation some pairs in three-
+    // element products are repeated, it is not reasonable to load these pairs anew every time. The
+    // values in the registers with these pairs are formed using shuffle instruction. Cofactors are
+    // calculated row by row (4 elements are placed in 1 SP FP SIMD floating point register).

-  // do row permutations to move this 2x2 block to the top
-  matrix.row(0).swap(matrix.row(good_row0));
-  matrix.row(1).swap(matrix.row(good_row1));
-  // now applying our helper function is numerically stable
-  ei_compute_inverse_in_size4_case_helper(matrix, result);
-  // Since we did row permutations on the original matrix, we need to do column permutations
-  // in the reverse order on the inverse
-  result->col(1).swap(result->col(good_row1));
-  result->col(0).swap(result->col(good_row0));
-}
+    tmp1   = _mm_mul_ps(row2, row3);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0  = _mm_mul_ps(row1, tmp1);
+    minor1  = _mm_mul_ps(row0, tmp1);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0  = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
+    minor1  = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
+    minor1  = _mm_shuffle_ps(minor1, minor1, 0x4E);
+    //    -----------------------------------------------
+    tmp1   = _mm_mul_ps(row1, row2);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0  = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
+    minor3  = _mm_mul_ps(row0, tmp1);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0  = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
+    minor3  = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
+    minor3  = _mm_shuffle_ps(minor3, minor3, 0x4E);
+    //    -----------------------------------------------
+    tmp1   = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    row2   = _mm_shuffle_ps(row2, row2, 0x4E);
+    minor0  = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
+    minor2  = _mm_mul_ps(row0, tmp1);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0  = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
+    minor2  = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
+    minor2  = _mm_shuffle_ps(minor2, minor2, 0x4E);
+    //    -----------------------------------------------
+    tmp1   = _mm_mul_ps(row0, row1);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
+    //           -----------------------------------------------
+    tmp1   = _mm_mul_ps(row0, row3);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
+    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
+    //           -----------------------------------------------
+    tmp1   = _mm_mul_ps(row0, row2);
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
+    tmp1   = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
+
+    // Evaluation of determinant and its reciprocal value. In the original Intel document,
+    // 1/det was evaluated using a fast rcpps command with subsequent approximation using
+    // the Newton-Raphson algorithm. Here, we go for a IEEE-compliant division instead,
+    // so as to not compromise precision at all.
+    det    = _mm_mul_ps(row0, minor0);
+    det    = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
+    det    = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
+//     tmp1= _mm_rcp_ss(det);
+//     det= _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
+    det    = _mm_div_ss(_mm_set_ss(1.0f), det); // <--- yay, one original line not copied from Intel
+    det    = _mm_shuffle_ps(det, det, 0x00);
+    // warning, Intel's variable naming is very confusing: now 'det' is 1/det !
+
+    // Multiplication of cofactors by 1/det. Storing the inverse matrix to the address in pointer src.
+    minor0 = _mm_mul_ps(det, minor0);
+    float *dst = result.data();
+    _mm_storel_pi((__m64*)(dst), minor0);
+    _mm_storeh_pi((__m64*)(dst+2), minor0);
+    minor1 = _mm_mul_ps(det, minor1);
+    _mm_storel_pi((__m64*)(dst+4), minor1);
+    _mm_storeh_pi((__m64*)(dst+6), minor1);
+    minor2 = _mm_mul_ps(det, minor2);
+    _mm_storel_pi((__m64*)(dst+ 8), minor2);
+    _mm_storeh_pi((__m64*)(dst+10), minor2);
+    minor3 = _mm_mul_ps(det, minor3);
+    _mm_storel_pi((__m64*)(dst+12), minor3);
+    _mm_storeh_pi((__m64*)(dst+14), minor3);
+  }
+};
+#endif

 /***********************************************
 *** Part 2 : selector and MatrixBase methods ***
@@ -216,7 +272,7 @@ struct ei_compute_inverse<MatrixType, 4>
 {
  static inline void run(const MatrixType& matrix, MatrixType* result)
  {
-    ei_compute_inverse_in_size4_case(matrix, result);
+    ei_compute_inverse_in_size4_case<MatrixType>::run(matrix, *result);
  }
 };

--- a/Eigen/src/Sparse/TriangularSolver.h
+++ b/Eigen/src/Sparse/TriangularSolver.h
@@ -43,8 +43,11 @@ struct ei_solve_triangular_selector<Lhs,Rhs,LowerTriangular,RowMajor|IsSparse>
        {
          lastVal = it.value();
          lastIndex = it.index();
+          if(lastIndex == i)
+            break;
          tmp -= lastVal * other.coeff(lastIndex,col);
        }
+
        if (Lhs::Flags & UnitDiagBit)
          other.coeffRef(i,col) = tmp;
        else
--- a/doc/StlContainers.dox
+++ b/doc/StlContainers.dox
@@ -11,7 +11,7 @@ namespace Eigen {

 \section summary Executive summary

-Using STL containers on \ref FixedSizeVectorizable "fixed-size vectorizable Eigen types" requires taking the following two steps:
+Using STL containers on \ref FixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires taking the following two steps:

 \li A 16-byte-aligned allocator must be used. Eigen does provide one ready for use: aligned_allocator.
 \li If you want to use the std::vector container, you need to \#include <Eigen/StdVector>.
--- a/doc/UnalignedArrayAssert.dox
+++ b/doc/UnalignedArrayAssert.dox
@@ -55,16 +55,17 @@ Note that here, Eigen::Vector2d is only used as an example, more generally the i

 \section c2 Cause 2: STL Containers

-If you use STL Containers such as std::vector, std::map, ..., with Eigen objects, like this,
+If you use STL Containers such as std::vector, std::map, ..., with Eigen objects, or with classes containing Eigen objects, like this,

 \code
 std::vector<Eigen::Matrix2f> my_vector;
-std::map<int, Eigen::Matrix2f> my_map;
+struct my_class { ... Eigen::Matrix2f m; ... };
+std::map<int, my_class> my_map;
 \endcode

 then you need to read this separate page: \ref StlContainers "Using STL Containers with Eigen".

-Note that here, Eigen::Matrix2f is only used as an example, more generally the issue arises for all \ref FixedSizeVectorizable "fixed-size vectorizable Eigen types".
+Note that here, Eigen::Matrix2f is only used as an example, more generally the issue arises for all \ref FixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref StructHavingEigenMembers "structures having such Eigen objects as member".

 \section c3 Cause 3: Passing Eigen objects by value

--- a/scripts/eigen_gen_docs
+++ b/scripts/eigen_gen_docs
@@ -11,7 +11,7 @@ USER=${USER:-'orzel'}
 # step 1 : build
 # todo if 'build is not there, create one:
 #mkdir build
-(cd build && cmake .. && make -j3 doc) || (echo "make failed"; exit 1)
+(cd build && cmake .. && make -j3 doc) || echo "make failed"; exit 1
 #todo: n+1 where n = number of cpus

 #step 2 : upload
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -180,6 +180,7 @@ ei_add_test(meta)
 ei_add_test(sizeof)
 ei_add_test(dynalloc)
 ei_add_test(nomalloc)
+ei_add_test(first_aligned)
 ei_add_test(mixingtypes)
 ei_add_test(packetmath)
 ei_add_test(unalignedassert)
--- a/test/first_aligned.cpp
+++ b/test/first_aligned.cpp
@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+
+template<typename Scalar>
+void test_first_aligned_helper(Scalar *array, int size)
+{
+  const int packet_size = sizeof(Scalar) * ei_packet_traits<Scalar>::size;
+  VERIFY(((size_t(array) + sizeof(Scalar) * ei_alignmentOffset(array, size)) % packet_size) == 0);
+}
+
+template<typename Scalar>
+void test_none_aligned_helper(Scalar *array, int size)
+{
+  VERIFY(ei_packet_traits<Scalar>::size == 1 || ei_alignmentOffset(array, size) == size);
+}
+
+struct some_non_vectorizable_type { float x; };
+
+void test_first_aligned()
+{
+  EIGEN_ALIGN_128 float array_float[100];
+  test_first_aligned_helper(array_float, 50);
+  test_first_aligned_helper(array_float+1, 50);
+  test_first_aligned_helper(array_float+2, 50);
+  test_first_aligned_helper(array_float+3, 50);
+  test_first_aligned_helper(array_float+4, 50);
+  test_first_aligned_helper(array_float+5, 50);
+  
+  EIGEN_ALIGN_128 double array_double[100];
+  test_first_aligned_helper(array_float, 50);
+  test_first_aligned_helper(array_float+1, 50);
+  test_first_aligned_helper(array_float+2, 50);
+  
+  double *array_double_plus_4_bytes = (double*)(size_t(array_double)+4);
+  test_none_aligned_helper(array_double_plus_4_bytes, 50);
+  test_none_aligned_helper(array_double_plus_4_bytes+1, 50);
+  
+  some_non_vectorizable_type array_nonvec[100];
+  test_first_aligned_helper(array_nonvec, 100);
+  test_none_aligned_helper(array_nonvec, 100);
+}
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp
@@ -45,7 +45,6 @@ template<typename MatrixType> void inverse_permutation_4x4()
 {
  typedef typename MatrixType::Scalar Scalar;
  typedef typename MatrixType::RealScalar RealScalar;
-  double error_max = 0.;
  Vector4i indices(0,1,2,3);
  for(int i = 0; i < 24; ++i)
  {
@@ -56,12 +55,9 @@ template<typename MatrixType> void inverse_permutation_4x4()
    m(indices(3),3) = 1;
    MatrixType inv = m.inverse();
    double error = double( (m*inv-MatrixType::Identity()).norm() / epsilon<Scalar>() );
-    error_max = std::max(error_max, error);
+    VERIFY(error == 0.0);
    std::next_permutation(indices.data(),indices.data()+4);
  }
-  std::cerr << "inverse_permutation_4x4, Scalar = " << type_name<Scalar>() << std::endl;
-  EIGEN_DEBUG_VAR(error_max);
-  VERIFY(error_max < 1. );
 }

 template<typename MatrixType> void inverse_general_4x4(int repeat)
@@ -86,8 +82,8 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
  double error_avg = error_sum / repeat;
  EIGEN_DEBUG_VAR(error_avg);
  EIGEN_DEBUG_VAR(error_max);
-  VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.4 : 1.4) );
-  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 160.0 : 75.) );
+  VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.0));
+  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
 }

 void test_prec_inverse_4x4()
--- a/test/sparse_solvers.cpp
+++ b/test/sparse_solvers.cpp
@@ -68,12 +68,20 @@ template<typename Scalar> void sparse_solvers(int rows, int cols)
    VERIFY_IS_APPROX(refMat2.template marked<LowerTriangular>().solveTriangular(vec2),
                     m2.template marked<LowerTriangular>().solveTriangular(vec3));

+    // lower - transpose
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular, &zeroCoords, &nonzeroCoords);
+    VERIFY_IS_APPROX(refMat2.template marked<LowerTriangular>().transpose().solveTriangular(vec2),
+                     m2.template marked<LowerTriangular>().transpose().solveTriangular(vec3));
+
    // upper
    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeUpperTriangular, &zeroCoords, &nonzeroCoords);
    VERIFY_IS_APPROX(refMat2.template marked<UpperTriangular>().solveTriangular(vec2),
                     m2.template marked<UpperTriangular>().solveTriangular(vec3));

-    // TODO test row major
+    // upper - transpose
+    initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeUpperTriangular, &zeroCoords, &nonzeroCoords);
+    VERIFY_IS_APPROX(refMat2.template marked<UpperTriangular>().transpose().solveTriangular(vec2),
+                     m2.template marked<UpperTriangular>().transpose().solveTriangular(vec3));
  }

  // test LLT
Author	SHA1	Message	Date
Benoit Jacob	5f73a8df20	bump	2010-01-10 11:30:10 -05:00
Thomas Capricelli	8a6d5f10dc	backport from tip : actually stop on compile failure	2010-01-06 17:17:40 +01:00
Benoit Jacob	ba6ed5fa5f	Fix CoeffReadCost in Part: it must account for the cost of the conditional jump. This makes Part considered an "expensive" xpr that must be evaluated in operations such as Product. This fixes bug #80.	2010-01-02 13:04:04 -05:00
Benoit Jacob	e4c88c14ec	clarify docs as requested on the forum	2010-01-02 12:54:55 -05:00
Benoit Jacob	74207a31fa	backport the fix to bug #79 , and the unit test	2010-01-02 12:45:49 -05:00
Benoit Jacob	6fd9248c09	add Intel copyright info	2009-12-15 08:43:31 -05:00
Benoit Jacob	4262117f84	backport 4x4 inverse changes: - use cofactors - use Intel's SSE code in the float case	2009-12-15 08:16:48 -05:00
Gael Guennebaud	b581cb870c	fix #74 : sparse triangular solver for lower/row-major matrices	2009-12-14 10:20:35 +01:00
Gael Guennebaud	72fc81dd9d	backport quaternion slerp precision fix	2009-12-05 18:28:17 +01:00
Gael Guennebaud	f36650b00a	fix MSVC10 compilation issues	2009-12-02 19:34:37 +01:00
Benoit Jacob	8d31f58ea1	fix bug #70 Was trying to apply stupid invertibility check to top-left 2x2 corner.	2009-11-26 15:33:07 -05:00
Benoit Jacob	a161a70696	Added tag 2.0.10 for changeset `8f1ce52e76`	2009-11-25 08:54:17 -05:00