mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Merged eigen/eigen into default
This commit is contained in:
@@ -204,7 +204,7 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__F16C__)
|
||||
#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
|
||||
// We can use the optimized fp16 to float and float to fp16 conversion routines
|
||||
#define EIGEN_HAS_FP16_C
|
||||
#endif
|
||||
@@ -214,10 +214,14 @@
|
||||
#include <vector_types.h>
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_HAS_CUDA_FP16
|
||||
#include <host_defines.h>
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
|
||||
#define EIGEN_HAS_OPENMP
|
||||
#endif
|
||||
|
||||
@@ -81,6 +81,8 @@ public:
|
||||
* This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
|
||||
// FIXME I'm not sure the current mapping is the ideal one.
|
||||
template<int M, int N> struct product_type_selector<M,N,1> { enum { ret = OuterProduct }; };
|
||||
template<int M> struct product_type_selector<M, 1, 1> { enum { ret = LazyCoeffBasedProductMode }; };
|
||||
template<int N> struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; };
|
||||
template<int Depth> struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; };
|
||||
template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; };
|
||||
template<> struct product_type_selector<Small,1, Small> { enum { ret = CoeffBasedProductMode }; };
|
||||
|
||||
@@ -168,11 +168,12 @@ MatrixBase<Derived>::stableNorm() const
|
||||
DerivedCopy copy(derived());
|
||||
|
||||
enum {
|
||||
CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME
|
||||
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|
||||
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
|
||||
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
|
||||
};
|
||||
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
|
||||
typename DerivedCopyClean
|
||||
::ConstSegmentReturnType>::type SegmentWrapper;
|
||||
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
|
||||
Index n = size();
|
||||
|
||||
if(n==1)
|
||||
|
||||
@@ -11,8 +11,8 @@
|
||||
#define EIGEN_GENERAL_BLOCK_PANEL_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
|
||||
@@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024;
|
||||
#endif
|
||||
|
||||
/** \internal */
|
||||
struct CacheSizes {
|
||||
struct CacheSizes {
|
||||
CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
|
||||
int l1CacheSize, l2CacheSize, l3CacheSize;
|
||||
queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
|
||||
@@ -107,13 +107,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
enum {
|
||||
kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
|
||||
ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
|
||||
k_mask = -8,
|
||||
|
||||
kr = 8,
|
||||
mr = Traits::mr,
|
||||
mr_mask = -mr,
|
||||
|
||||
nr = Traits::nr,
|
||||
nr_mask = -nr
|
||||
nr = Traits::nr
|
||||
};
|
||||
// Increasing k gives us more time to prefetch the content of the "C"
|
||||
// registers. However once the latency is hidden there is no point in
|
||||
@@ -121,7 +117,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
// experimentally).
|
||||
const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
|
||||
if (k_cache < k) {
|
||||
k = k_cache & k_mask;
|
||||
k = k_cache - (k_cache % kr);
|
||||
eigen_internal_assert(k > 0);
|
||||
}
|
||||
|
||||
@@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
if (n_cache <= n_per_thread) {
|
||||
// Don't exceed the capacity of the l2 cache.
|
||||
eigen_internal_assert(n_cache >= static_cast<Index>(nr));
|
||||
n = n_cache & nr_mask;
|
||||
n = n_cache - (n_cache % nr);
|
||||
eigen_internal_assert(n > 0);
|
||||
} else {
|
||||
n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
|
||||
n = (std::min<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
|
||||
}
|
||||
|
||||
if (l3 > l2) {
|
||||
@@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
|
||||
const Index m_per_thread = numext::div_ceil(m, num_threads);
|
||||
if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
|
||||
m = m_cache & mr_mask;
|
||||
m = m_cache - (m_cache % mr);
|
||||
eigen_internal_assert(m > 0);
|
||||
} else {
|
||||
m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
|
||||
m = (std::min<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -156,23 +152,23 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
l2 = 32*1024;
|
||||
l3 = 512*1024;
|
||||
#endif
|
||||
|
||||
|
||||
// Early return for small problems because the computations below are time consuming for small problems.
|
||||
// Perhaps it would make more sense to consider k*n*m??
|
||||
// Note that for very tiny problems, this function should be bypassed anyway
|
||||
// because we use the coefficient-based implementation for them.
|
||||
if((std::max)(k,(std::max)(m,n))<48)
|
||||
return;
|
||||
|
||||
|
||||
typedef typename Traits::ResScalar ResScalar;
|
||||
enum {
|
||||
k_peeling = 8,
|
||||
k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
|
||||
k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
|
||||
};
|
||||
|
||||
|
||||
// ---- 1st level of blocking on L1, yields kc ----
|
||||
|
||||
|
||||
// Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
|
||||
// of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
|
||||
// We also include a register-level block of the result (mr x nr).
|
||||
@@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
// while keeping the same number of sweeps over the result.
|
||||
k = (k%max_kc)==0 ? max_kc
|
||||
: max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
|
||||
|
||||
|
||||
eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
|
||||
}
|
||||
|
||||
|
||||
// ---- 2nd level of blocking on max(L2,L3), yields nc ----
|
||||
|
||||
|
||||
// TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
|
||||
// actual_l2 = max(l2, l3/nb_core_sharing_l3)
|
||||
// The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
|
||||
@@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
#else
|
||||
const Index actual_l2 = 1572864; // == 1.5 MB
|
||||
#endif
|
||||
|
||||
|
||||
// Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
|
||||
// The second half is implicitly reserved to access the result and lhs coefficients.
|
||||
// When k<max_kc, then nc can grow arbitrarily. In practice, it seems to be fruitful
|
||||
|
||||
@@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
|
||||
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
|
||||
const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
|
||||
{
|
||||
general_matrix_matrix_triangular_product<Index,
|
||||
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
|
||||
|
||||
@@ -27,13 +27,13 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C
|
||||
HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
|
||||
};
|
||||
static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
|
||||
const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
|
||||
};
|
||||
|
||||
template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
|
||||
EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
|
||||
::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
|
||||
const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
|
||||
{
|
||||
static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
|
||||
Index size = (std::min)(_rows,_cols);
|
||||
|
||||
Reference in New Issue
Block a user