Compare commits

...

482 Commits

Author SHA1 Message Date
Gael Guennebaud
81bdde705c Bump to 3.3.5 2018-07-23 11:33:42 +02:00
Gael Guennebaud
06fc5761fa Oopps, EIGEN_COMP_MSVC is not available before including Eigen.
(grafted from de70671937
)
2018-07-20 17:51:17 +02:00
Gael Guennebaud
a185bc485c Disable optimization for sparse_product unit test with MSVC 2013, otherwise it takes several hours to build.
(grafted from 56a750b6cc
)
2018-07-20 08:36:38 -07:00
Gael Guennebaud
96134409fc Fix weird issue with MSVC 2013
(grafted from 3a2dc3869e
)
2018-07-18 02:26:43 -07:00
Gael Guennebaud
ab3fa2e123 Fix GeneralizedEigenSolver when requesting for eigenvalues only.
(grafted from a87cff20df
)
2018-07-14 09:38:49 +02:00
Gael Guennebaud
ae6e5caa40 Fix unit test
(grafted from a7b313a16c
)
2018-07-01 22:45:47 +02:00
Gael Guennebaud
483beabab9 bug #1560 fix product with a 1x1 diagonal matrix
(grafted from ee5864f72e
)
2018-06-25 10:30:12 +02:00
Jayaram Bobba
5c59564bfb fix AVX512 plog
(grafted from b7b868d1c4
)
2018-04-20 13:39:18 -07:00
Gael Guennebaud
1939c971a3 AVX512: _mm512_rsqrt28_ps is available for AVX512ER only
(grafted from 40b4bf3d32
)
2018-04-03 14:36:27 +02:00
Gael Guennebaud
c2f9e6cb37 AVX512: fix psqrt and prsqrt
(grafted from 7b0630315f
)
2018-04-03 14:12:50 +02:00
Rasmus Munk Larsen
1641a6cdd5 Fix typo in pbend for AltiVec.
(grafted from bda71ad394
)
2018-06-22 15:04:35 -07:00
Rasmus Munk Larsen
fea50d40ea Fix oversharding bug in parallelFor.
(grafted from 5418154a45
)
2018-06-20 17:51:48 -07:00
Gael Guennebaud
c1128efb6c fix md5sum of lapack_addons
(grafted from b8271bb368
)
2018-06-15 14:21:29 +02:00
Gael Guennebaud
20ca86888e bug #1555: compilation fix with XLC 2018-06-21 10:28:58 +02:00
Gael Guennebaud
36a1cd87d9 Fiw some warnings in dox examples
(grafted from c25034710e
)
2018-06-07 16:09:22 +02:00
Gael Guennebaud
523e442a7b Fix warning
(grafted from c723ffd763
)
2018-06-07 15:56:20 +02:00
Gael Guennebaud
48048172e5 Fix int versus Index
(grafted from 37348d03ae
)
2018-06-07 15:56:43 +02:00
Gael Guennebaud
e9bd839b13 Fix warning
(grafted from af7c83b9a2
)
2018-06-07 15:45:24 +02:00
Gael Guennebaud
3df78d5afc Fix MSVC warning C4290: C++ exception specification ignored except to indicate a function is not __declspec(nothrow)
(grafted from 7fe29aceeb
)
2018-06-07 15:36:20 +02:00
Gael Guennebaud
352489edbe Fix short vs long 2018-06-07 15:26:04 +02:00
Gael Guennebaud
450c5e5d27 Fix compilation with MSVC by reverting to char* for _mm_prefetch except for PGI (the later being the one that has the wrong prototype).
(grafted from 7134fa7a2e
)
2018-06-07 09:33:10 +02:00
Gael Guennebaud
64cc5f8512 Don't run hg on non mercurial clone
(grafted from 84868da904
)
2018-05-31 21:21:57 +02:00
Gael Guennebaud
656712d48f Doc: add aliasing in common pitfaffs.
(grafted from 6af1433cb5
)
2018-05-29 22:37:47 +02:00
Gael Guennebaud
971b32440c Define pcast<> for SSE types even when AVX is enabled. (otherwise float are silently reinterpreted as int instead of being converted)
(grafted from 647b724a36
)
2018-05-29 20:46:46 +02:00
Gael Guennebaud
bb87f618bf Fix compilation and SSE support with PGI compiler
(grafted from 49262dfee6
)
2018-05-29 15:09:31 +02:00
Jeff Trull
2f9de52245 Add tests for sparseQR results (value and size) covering bugs #1522 and #1544 2018-04-21 10:26:30 -07:00
Jeff Trull
2136cfa17e Make sparse QR result sizes consistent with dense QR, with the following rules:
1) Q is always square
2) Q*R*P' is valid and recovers the original matrix

This implies that the size of Q is the number of rows in the original matrix, square,
and that the size of R is the size of the original matrix.
2018-02-15 15:00:31 -08:00
Christoph Hertzberg
39125654ce bug #1544: Generate correct Q matrix in complex case. Original patch was by Jeff Trull in PR-386. 2018-05-17 19:17:01 +02:00
Gael Guennebaud
927d023cea Fix compilation with NEON+MSVC
(grafted from 6e7118265d
)
2018-04-26 10:50:41 +02:00
Gael Guennebaud
1e2d2693b9 bug #1428: atempt to make NEON vectorization compilable by MSVC.
The workaround is to wrap NEON packet types to make them different c++ types.
(grafted from e8ca5166a9
)
2018-04-24 11:19:49 +02:00
Gael Guennebaud
7634a44bfe Fix "used uninitialized" warnings
(grafted from 2f3287da7d
)
2018-04-24 17:17:25 +02:00
Gael Guennebaud
2480d04ac7 Workaround warning
(grafted from 3ffd449ef5
)
2018-04-24 17:11:51 +02:00
Gael Guennebaud
c92536d926 workaround MSVC 2013 compilation issue (ambiguous call)
(grafted from a57e6e5f0f
)
2018-04-23 15:31:51 +02:00
Gael Guennebaud
80af7d6a47 bug #1543: fix linear indexing in generic block evaluation (this completes the fix in commit 12efc7d41b
)
(grafted from 5679e439e0
)
2018-04-23 14:40:16 +02:00
Gael Guennebaud
87f9e301f9 Fix unit test
(grafted from 35b31353ab
)
2018-04-22 22:49:08 +02:00
Christoph Hertzberg
542fb03968 Fix enum-compare warning 2018-04-20 23:11:37 +02:00
Christoph Hertzberg
f90d136c84 Add parenthesis to fix compiler warnings 2018-04-15 18:43:56 +02:00
Gael Guennebaud
877a2b64c9 fix const cast in NEON
(grafted from 686fb57233
)
2018-04-18 18:46:34 +02:00
Dmitriy Korchemkin
e6577f3c30 Cast zeros to Scalar in RealSchur 2018-04-18 13:52:46 +03:00
Gael Guennebaud
69e01a2999 update cdash 2018-04-17 17:22:56 +02:00
Christoph Hertzberg
5f71579a2d Another fix to make boost::multiprecision compile again 2018-04-13 20:22:57 +02:00
Christoph Hertzberg
686e0749a5 Recent Adolc versions require C++11 2018-04-13 19:10:23 +02:00
Christoph Hertzberg
385d8b5e42 Make hypot_impl compile again for types with expression-templates (e.g., boost::multiprecision) 2018-04-13 19:01:37 +02:00
Christoph Hertzberg
4662c610c1 SelfAdjointView<...,Mode> causes a static assert since commit d820ab9edc 2018-04-13 19:00:34 +02:00
Gael Guennebaud
906a98fe39 fix linking issue
(grafted from 7a9089c33c
)
2018-04-13 08:51:47 +02:00
Gael Guennebaud
1c4fdad7bd bug #1520: workaround some -Wfloat-equal warnings by calling std::equal_to 2018-04-11 15:24:13 +02:00
Gael Guennebaud
3f711f3356 extend doxygen splitter for huge screens
(grafted from 79266fec75
)
2018-04-11 11:31:17 +02:00
Gael Guennebaud
b02ab76847 Update header/footer for doxygen 1.8.13
(grafted from 426052ef6e
)
2018-04-11 11:30:34 +02:00
Gael Guennebaud
5fec52ced1 Fix javascript hacks for oxygen 1.8.13
(grafted from 9c8decffbf
)
2018-04-11 11:30:14 +02:00
Gael Guennebaud
bde2bfcee8 bug #1538: update manual pages regarding BDCSVD.
(grafted from e798466871
)
2018-04-11 10:46:11 +02:00
Gael Guennebaud
eab7afe252 Fix MKL backend for symmetric eigenvalues on row-major matrices.
(grafted from add15924ac
)
2018-04-09 13:29:26 +02:00
Gael Guennebaud
81e94eea02 Fix cmake scripts with no fortran compiler
(grafted from c2624c0318
)
2018-04-07 08:45:19 +02:00
Gael Guennebaud
a2a2c3c865 bug #1509: fix computeInverseWithCheck for complexes
(grafted from 2f833b1c64
)
2018-04-04 15:47:46 +02:00
Gael Guennebaud
90cd199d4b Factories code between numext::hypot and scalar_hyot_op functor.
(grafted from 4213b63f5c
)
2018-04-04 15:12:43 +02:00
Gael Guennebaud
b18e2d422b bug #1521: avoid signalling NaN in hypot and make it std::complex<> friendly.
(grafted from e116f6847e
)
2018-04-04 13:47:23 +02:00
Gael Guennebaud
892c0a79ce bug #1494: makes pmin/pmax behave on Altivec/VSX as on x86 regading NaNs
(grafted from e91e314347
)
2018-04-04 11:39:19 +02:00
Gael Guennebaud
59398aa2bb comment unreachable code
(grafted from 112c899304
)
2018-04-03 23:16:43 +02:00
Gael Guennebaud
170914dbbc Fix compilation of product with inverse transpositions (e.g., mat * Transpositions().inverse())
(grafted from a1292395d6
)
2018-04-03 23:06:44 +02:00
Gael Guennebaud
866d222d60 commit 45e9c9996da790b55ed9c4b0dfeae49492ac5c46 (HEAD -> memory_fix)
Author: George Burgess IV <gbiv@google.com>
Date:   Thu Mar 1 11:20:24 2018 -0800

    Prefer `::operator new` to `new`

    The C++ standard allows compilers much flexibility with `new`
    expressions, including eliding them entirely
    (https://godbolt.org/g/yS6i91). However, calls to `operator new` are
    required to be treated like opaque function calls.

    Since we're calling `new` for side-effects other than allocating heap
    memory, we should prefer the less flexible version.

    Signed-off-by: George Burgess IV <gbiv@google.com>
(grafted from 8c7b5158a1
)
2018-04-03 17:15:38 +02:00
Gael Guennebaud
86a939451c bug #1527: fix support for MKL's VML (destination was not properly resized)
(grafted from dd4cc6bd9e
)
2018-04-03 17:11:15 +02:00
Gael Guennebaud
9ff3150243 bug #1528: better use numeric_limits::min() instead of 1/highest() that with underflow.
(grafted from c5b56f1fb2
)
2018-04-03 16:49:35 +02:00
Benoit Steiner
a7144f8d6a Made the TensorStorage class compile with clang 3.9
(grafted from de7b0fdea9
)
2017-02-28 13:52:22 -08:00
Gael Guennebaud
273738ba6f bug #1516: add assertion for out-of-range diagonal index in MatrixBase::diagonal(i)
(grafted from 8d0ffe3655
)
2018-04-03 16:15:43 +02:00
Gael Guennebaud
3fb42ff7b2 bug #1532: disable stl::*_negate in C++17 (they are deprecated)
(grafted from 407e3e2621
)
2018-04-03 15:59:30 +02:00
Gael Guennebaud
e90a14609a Fix uninitialized output argument.
(grafted from 524119d32a
)
2018-04-03 10:56:10 +02:00
Gael Guennebaud
ece56baba0 Merged in bfierz/eigen/3.3 (pull request PR-345)
Adds missing EIGEN_STRONG_INLINE to support MSVC properly inlining small vector calculations
2018-03-27 07:40:13 +00:00
Gael Guennebaud
1724dae8b8 Add static assertion for fixed sizes Ref<>
(grafted from f7d17689a5
)
2018-03-09 10:11:13 +01:00
Gael Guennebaud
190b46dd1f Implement better static assertion checking to make sure that the first assertion is a static one and not a runtime one.
(grafted from f6be7289d7
)
2018-03-09 10:00:51 +01:00
Gael Guennebaud
74daf12e52 Add static assertion on selfadjoint-view's UpLo parameter.
(grafted from d820ab9edc
)
2018-03-09 09:33:43 +01:00
Gael Guennebaud
c24844195d bug #1517: fix triangular product with unit diagonal and nested scaling factor: (s*A).triangularView<UpperUnit>()*B
(grafted from 5deeb19e7b
)
2018-02-09 16:52:35 +01:00
Gael Guennebaud
15752027ec Fix linear indexing in generic block evaluation.
(grafted from 12efc7d41b
)
2018-02-09 16:45:49 +01:00
Eugene Chereshnev
bfc66e8b9a Fix incorrect ldvt in LAPACKE call from JacobiSVD
(grafted from f558ad2955
)
2018-01-03 12:55:52 -08:00
Gael Guennebaud
b60cbbef37 fix compilation with old compiler 2017-12-15 17:53:48 +01:00
Gael Guennebaud
33b972d8b3 Fix compilation of stableNorm with some expressions as input
(grafted from 06bf1047f9
)
2017-12-15 15:15:37 +01:00
Gael Guennebaud
bb28a2aada fix warning 2017-12-15 14:43:33 +01:00
Gael Guennebaud
acd0ce11aa Fix cmake warning
(grafted from 31e0bda2e3
)
2017-12-14 15:48:27 +01:00
Basil Fierz
01fb621733 Adds missing EIGEN_STRONG_INLINE to support MSVC properly inlining small vector calculations
When working with MSVC often small vector operations are not properly inlined. This behaviour is observed even on the most recent compiler versions.
2017-10-26 22:44:28 +02:00
Benoit Steiner
71d1198ccd Merged in henryiii/eigen/henryiii/device33 (pull request PR-344)
Branch 3.3: Fixing missing inlines on device functions for newer CUDA cards
2017-10-21 01:59:01 +00:00
Henry Schreiner
95ec3232c6 Restore __device__ 2017-10-21 00:48:05 +00:00
Henry Schreiner
243249718b Adding missing inlines for CUDA and ARCH 6 2017-10-20 13:00:23 +00:00
Gael Guennebaud
32a6db0f8c bug #1468 (1/2) : add missing std:: to memcpy
(grafted from 8579195169
)
2017-09-22 09:23:24 +02:00
Gael Guennebaud
6fc0f2be70 Update documentation for aligned_allocator
(grafted from 7ad07fc6f2
)
2017-09-20 10:22:00 +02:00
Gael Guennebaud
70ac6c9230 Add C++11 max_digits10 for half.
(grafted from 9c353dd145
)
2017-09-06 10:22:47 +02:00
Gael Guennebaud
609e425166 Implement true compile-time "if" for apply_rotation_in_the_plane. This fixes a compilation issue for vectorized real type with missing vectorization for complexes, e.g. AVX512.
(grafted from b35d1ce4a5
)
2017-09-06 10:02:49 +02:00
Gael Guennebaud
4ead16cdd6 Fix mixing types in sparse matrix products.
(grafted from 80142362ac
)
2017-09-02 22:50:20 +02:00
Gael Guennebaud
361102f88b Merged in dtrebbien/eigen/patch-1 (pull request PR-312)
Work around a compilation error seen with nvcc V8.0.61
(grafted from fc39d5954b
)
2017-08-22 12:17:37 +00:00
Gael Guennebaud
5d40715db6 Handle min/max/inf/etc issue in cuda_fp16.h directly in test/main.h
(grafted from 304ef29571
)
2017-08-24 11:26:41 +02:00
Gael Guennebaud
e7c065ec71 bug #1462: remove all occurences of the deprecated __CUDACC_VER__ macro by introducing EIGEN_CUDACC_VER 2017-08-24 11:06:47 +02:00
Gael Guennebaud
18868228ad bug #336: improve doc for PlainObjectBase::Map
(grafted from 39864ebe1e
)
2017-08-22 17:18:43 +02:00
Gael Guennebaud
fbb0c510c5 Add missing scalar conversion
(grafted from 600e52fc7f
)
2017-08-22 17:06:57 +02:00
Gael Guennebaud
a8d2459f8e bug #1449: fix redux_3 unit test
(grafted from bc4dae9aeb
)
2017-08-22 15:59:08 +02:00
Gael Guennebaud
9a266e5118 bug #1461: fix compilation of Map<const Quaternion>::x()
(grafted from bc91a2df8b
)
2017-08-22 15:10:42 +02:00
Gael Guennebaud
51e1aa1539 Doc: warn about constness in LLT::solveInPlace
(grafted from b223918ea9
)
2017-08-22 14:12:47 +02:00
Jim Radford
0137ed4f19 LLT: const the arg to solveInPlace() to allow passing .transpose(), .block(), etc.
(grafted from 0c226644d8
)
2017-01-04 14:42:57 -08:00
Jim Radford
9d03711df8 LLT: avoid making a copy when decomposing in place
(grafted from be281e5289
)
2017-01-04 14:43:56 -08:00
Gael Guennebaud
1ca9072b51 Gub 1453: fix Map with non-default inner-stride but no outer-stride.
(grafted from e27f17bf5c
)
2017-08-22 13:27:37 +02:00
Gael Guennebaud
9fd138e2b3 Re-enable hidden doc in LLT
(grafted from 2c3d70d915
)
2017-08-22 12:04:09 +02:00
Gael Guennebaud
55fbf4fedd bug #1456: add perf recommendation for LLT and storage format
(grafted from 21d0a0bcf5
)
2017-08-22 12:46:35 +02:00
Gael Guennebaud
b87875abf8 bug #1455: Cholesky module depends on Jacobi for rank-updates.
(grafted from a6e7a41a55
)
2017-08-22 11:37:32 +02:00
Gael Guennebaud
ac2c97edff bug #1458: fix documentation of LLT and LDLT info() method.
(grafted from e6021cc8cc
)
2017-08-22 11:32:55 +02:00
Gael Guennebaud
292dea7922 Clarify MKL_DIRECT_CALL doc.
(grafted from 2810ba194b
)
2017-08-17 22:12:26 +02:00
Gael Guennebaud
070b5958e0 use MKL's lapacke.h header when using MKL
(grafted from f727844658
)
2017-08-17 21:58:39 +02:00
Gael Guennebaud
3108fbf767 Clarify doc regarding the usage of MKL_DIRECT_CALL
(grafted from 8c858bd891
)
2017-08-17 12:17:45 +02:00
Gael Guennebaud
9df7f3d8e9 Fix support for MKL's BLAS when using MKL_DIRECT_CALL.
(grafted from b95f92843c
)
2017-08-17 12:07:10 +02:00
Gael Guennebaud
782fd81dee Disable BDCSVD preallocation check.
(grafted from d580a90c9a
)
2017-07-20 10:03:54 +02:00
Gael Guennebaud
fa77d71335 Fix lazyness of operator* with CUDA 2017-07-20 09:47:28 +02:00
Gael Guennebaud
3d1795da28 Fix gcc7 warning: Wint-in-bool-context 2017-06-27 14:32:36 +02:00
Gael Guennebaud
d1c2d6683c Fix a gcc7 warning: Wint-in-bool-context
(grafted from b651ce0ffa
)
2017-06-26 09:58:28 +02:00
Christoph Hertzberg
d8cf158e06 Make sure CMAKE_Fortran_COMPILER is set before checking for Fortran functions 2017-06-20 16:31:53 +02:00
Gael Guennebaud
bc837b7975 bug #1436: fix compilation of Jacobi rotations with ARM NEON, some specializations of internal::conj_helper were missing.
(grafted from b240080e64
)
2017-06-15 10:16:30 +02:00
Gael Guennebaud
68e8f2b833 Added tag 3.3.4 for changeset 3dc3a0ea2d 2017-06-15 09:10:26 +02:00
Gael Guennebaud
3dc3a0ea2d bump to 3.3.4 2017-06-15 09:10:20 +02:00
Gael Guennebaud
79120a4c63 Enable Array(EigenBase<>) ctor for compatible scalar types only. This prevents nested arrays to look as being convertible from/to simple arrays.
(grafted from 9fbdf02059
)
2017-06-12 22:30:32 +02:00
Gael Guennebaud
e0412f18fd Fix compilation of streaming nested Array, i.e., cout << Array<Array<>>
(grafted from e43d8fe9d7
)
2017-06-12 22:26:26 +02:00
Gael Guennebaud
40b0c43bda Fix 1x1 case in Solve expression with EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION==RowMajor
(grafted from d9d7bd6d62
)
2017-06-12 22:25:02 +02:00
Gael Guennebaud
72f3e20e74 Fix LeastSquareDiagonalPreconditioner for complexes (issue introduced in previous commit)
(grafted from a7be4cd1b1
)
2017-06-09 11:57:53 +02:00
Gael Guennebaud
676a7a3271 fix compilation in C++98
(grafted from 8640093af1
)
2017-06-09 12:45:01 +02:00
Gael Guennebaud
f843239452 bug #1414: doxygen, add EigenBase to CoreModule
(grafted from 90168c003d
)
2017-06-09 14:01:44 +02:00
Gael Guennebaud
a4ab0c6b6a Fix compilation with some compilers
(grafted from a4fd4233ad
)
2017-06-09 23:02:02 +02:00
Gael Guennebaud
ef955ea8e5 fix tipo
(grafted from 50e09cca0f
)
2017-06-11 15:30:36 +02:00
NeroBurner
8bd392ca0e add cmake-option to enable/disable creation of tests
* * *
disable unsupportet/test when test are disabled
* * *
rename EIGEN_ENABLE_TESTS to BUILD_TESTING
* * *
consider BUILD_TESTING in blas
(grafted from c4fc2611ba
)
2017-01-02 09:09:21 +01:00
Gael Guennebaud
8d2ac85797 fix unit test
(grafted from 26a2c6fc16
)
2017-12-14 15:11:04 +01:00
Gael Guennebaud
6d6e5fcd43 Add possibility to overwrite EIGEN_STRONG_INLINE. 2017-12-14 14:47:38 +01:00
Gael Guennebaud
9c9e90f6db Fix packet and alignment propagation logic of Block<Xpr> expressions. In particular, (A+B).col(j) lost vectorisation.
(grafted from 9c3aed9d48
)
2017-12-14 14:24:33 +01:00
Gael Guennebaud
7ffa27f347 ignore all *build* sub directories
(grafted from 76c7dae600
)
2017-12-14 14:22:14 +01:00
Gael Guennebaud
c20043c8fd bug #1479: fix failure detection in LDLT
(grafted from 672bdc126b
)
2017-11-16 17:55:24 +01:00
Gael Guennebaud
d18877f18d bug #1485: fix linking issue of non template functions
(grafted from 7cc503f9f5
)
2017-11-15 21:33:37 +01:00
Justin Carpentier
02c0cef97f Use col method for column-major matrix
(grafted from a020d9b134
)
2017-10-17 21:51:27 +02:00
Gael Guennebaud
c8e663fe87 bug #1484: restore deleted line for 128 bits long doubles, and improve dispatching logic.
(grafted from 0a1cc73942
)
2017-11-10 10:25:41 +01:00
Gael Guennebaud
7a875acfb0 Fix overflow issues in BDCSVD
(grafted from e8468ea91b
)
2017-11-08 10:24:28 +01:00
Gael Guennebaud
3ec11d8f17 Fix compilation
(grafted from 7713e20fd2
)
2016-12-27 22:04:58 +01:00
Gael Guennebaud
ec067ac5e3 bug #1403: more scalar conversions fixes in BDCSVD
(grafted from 731c8c704d
)
2017-06-09 15:45:49 +02:00
Gael Guennebaud
316969d839 bug #1403: fix implicit scalar type conversion.
(grafted from 1bbcf19029
)
2017-06-09 14:44:02 +02:00
Gael Guennebaud
7a0a9581b5 bug #1405: enable StrictlyLower/StrictlyUpper triangularView as the destination of matrix*matrix products.
(grafted from ba5cab576a
)
2017-06-09 14:38:04 +02:00
Gael Guennebaud
8880be60fa fix compilation of Half in C++98 (issue introduced in previous commit)
(grafted from 26f552c18d
)
2017-06-09 13:36:58 +02:00
Gael Guennebaud
e41713d52e Fix compilation with gcc 4.3 and ARM NEON
(grafted from 1d59ca2458
)
2017-06-09 13:20:52 +02:00
Gael Guennebaud
b69e465d7a bug #1410: fix lvalue propagation of Array/Matrix-Wrapper with a const nested expression.
(grafted from fb1ee04087
)
2017-06-09 13:13:03 +02:00
Joao Rui Leal
0db83fc571 it is now possible to change Umfpack control settings before factorizations; added access to the report functions of Umfpack
(grafted from 95b804c0fe
)
2016-12-19 10:45:59 +00:00
Gael Guennebaud
1ac703f641 bug #1424: add numext::abs specialization for unsigned integer types. 2017-06-09 11:53:49 +02:00
Gael Guennebaud
2c32368642 Add missing std::numeric_limits specialization for half, and complete NumTraits<half>
(grafted from d588822779
)
2017-06-09 11:51:53 +02:00
Gael Guennebaud
db40309e70 bug #1423: fix LSCG\'s Jacobi preconditioner for row-major matrices.
(grafted from 682b2ef17e
)
2017-06-08 15:06:27 +02:00
Gael Guennebaud
e36c1f7501 bug #1435: fix aliasing issue in exressions like: A = C - B*A;
(grafted from 4bbc320468
)
2017-06-08 12:55:25 +02:00
Mmanu Chaturvedi
3aef5c1a2f Specializing numeric_limits For AutoDiffScalar
(grafted from 2971503fed
)
2017-05-23 17:12:36 -04:00
Gael Guennebaud
ab6bb89980 Fix compilation of matrix log with Map as input
(grafted from 26e8f9171e
)
2017-06-07 10:51:23 +02:00
Gael Guennebaud
983ace99d4 bug #1411: fix usage of alignment information in vectorization of quaternion product and conjugate.
(grafted from f2a553fb7b
)
2017-06-07 10:10:30 +02:00
Gael Guennebaud
72fa6775e8 bug #1417: make LinSpace compatible with std::complex
(grafted from 8508db52ab
)
2017-06-06 17:25:56 +02:00
Gael Guennebaud
9f25cdf4f6 Fix dense * sparse-selfadjoint-view product.
(grafted from 891ac03483
)
2017-04-25 13:58:10 +02:00
Gael Guennebaud
6e5edd68d3 Improve mixing of complex and real in the vectorized path of apply_rotation_in_the_plane
(grafted from d9084ac8e1
)
2017-04-14 11:05:13 +02:00
Gael Guennebaud
e8978ffa99 Fix unwanted Real to Scalar to Real conversions in column-pivoting QR.
(grafted from f75dfdda7e
)
2017-04-14 10:34:30 +02:00
Gael Guennebaud
c753fe7cc3 Improve cmake scripts for Pastix and BLAS detection.
(grafted from 0f83aeb6b2
)
2017-04-14 10:22:12 +02:00
Gael Guennebaud
e59e345720 better check array index before using it
(grafted from 89fd0c3881
)
2017-03-15 15:18:03 +01:00
Benoit Jacob
07c2244440 ARM prefetch fixes: Implement prefetch on ARM64. Do not clobber cc on ARM32. 2017-03-15 06:53:35 -04:00
Gael Guennebaud
1865dccd58 bug #1401: fix compilation of "cond ? x : -x" with x an AutoDiffScalar
(grafted from 970ff78294
)
2017-03-08 16:16:53 +01:00
Gael Guennebaud
f2e6ee9687 remove UTF8 symbol
(grafted from 5694315fbb
)
2017-03-07 10:53:47 +01:00
Gael Guennebaud
9219307e13 remove UTF8 symbols
(grafted from e958c2baac
)
2017-03-07 10:47:40 +01:00
Gael Guennebaud
f2e8f96151 bug #1400: fix stableNorm with EIGEN_DONT_ALIGN_STATICALLY
(grafted from 659087b622
)
2017-03-07 10:02:34 +01:00
Gael Guennebaud
faf8af25ed bug #1396: add some missing EIGEN_DEVICE_FUNC
(grafted from 4e98a7b2f0
)
2017-02-28 09:47:38 +01:00
Gael Guennebaud
106ba41c2a Fix typo.
(grafted from 478a9f53be
)
2017-02-28 09:32:45 +01:00
Benoit Steiner
87939ea0dd Added missing EIGEN_DEVICE_FUNC to the SelfCwise binary ops
(grafted from 889c606f8f
)
2017-02-27 17:17:47 -08:00
Benoit Steiner
e813640aa1 Added missing EIGEN_DEVICE_FUNC qualifiers to several nullary op methods.
(grafted from 193939d6aa
)
2017-02-27 17:11:47 -08:00
Benoit Steiner
612b8f2749 Declared the plset, ploadt_ro, and ploaddup packet primitives as usable within a gpu kernel
(grafted from ed4dc9d01a
)
2017-02-27 16:57:01 -08:00
Benoit Steiner
ead8e1b796 Added missing EIGEN_DEVICE_FUNC qualifiers.
(grafted from b1fc7c9a09
)
2017-02-27 16:48:30 -08:00
Benoit Steiner
3d4265f2d5 Added EIGEN_DEVICE_FUNC to make the prototype of the EigenBase override match that of DenseBase
(grafted from 554116bec1
)
2017-02-27 16:45:31 -08:00
Benoit Steiner
d66586ac90 Avoid unecessary float to double conversions.
(grafted from 34d9fce93b
)
2017-02-27 16:33:33 -08:00
Gael Guennebaud
44920624fb Added tag 3.3.3 for changeset 208058b9ad 2017-02-21 14:36:39 +01:00
Gael Guennebaud
208058b9ad bump to 3.3.3 2017-02-21 14:36:34 +01:00
Gael Guennebaud
b4218b8473 Use int32_t instead of int in NEON code. Some platforms with 16 bytes int supports ARM NEON.
(grafted from cbbf88c4d7
)
2017-02-17 14:39:02 +01:00
Gael Guennebaud
3c2f0812f6 bug #1394: fix compilation of SelfAdjointEigenSolver<Matrix>(sparse*sparse);
(grafted from 76687f385c
)
2017-02-20 14:27:26 +01:00
Gael Guennebaud
17bbd82f7d bug #1380: for Map<> as input of matrix exponential
(grafted from d8b1f6cebd
)
2017-02-20 14:06:06 +01:00
Gael Guennebaud
e1385337ff bug #1395: fix the use of compile-time vectors as inputs of JacobiSVD.
(grafted from 6572825703
)
2017-02-20 13:44:37 +01:00
Gael Guennebaud
d367ecb475 Silent warning.
(grafted from a811a04696
)
2017-02-20 10:14:21 +01:00
Gael Guennebaud
c3b658b2c9 Fix tracking of temporaries in unit tests
(grafted from deefa54a54
)
2017-02-19 10:32:54 +01:00
Gael Guennebaud
f9d655a8c8 Fix compilation.
(grafted from f8a55cc062
)
2017-02-18 10:08:13 +01:00
Gael Guennebaud
ad3e4d1a49 bug #1393: enable Matrix/Array explicit ctor from types with conversion operators (was ok with 3.2)
(grafted from 582b5e39bf
)
2017-02-17 14:10:57 +01:00
Gael Guennebaud
222ed66f79 Fix usage of CUDACC_VER 2017-02-20 08:16:54 +01:00
Gael Guennebaud
6bceebfabf bug #1391: include IO.h before DenseBase to enable its usage in DenseBase plugins. 2017-02-13 09:46:20 +01:00
Gael Guennebaud
2ca3eb8407 bug #1392: fix #include <Eigen/Sparse> with mpl2-only
(grafted from c16ee72b20
)
2017-02-11 10:35:01 +01:00
Gael Guennebaud
698205cddf Suppress warning 2017-02-10 21:30:31 +01:00
Gael Guennebaud
2ecb33820f Fix prunning in (sparse*sparse).pruned() when the result is nearly dense.
(grafted from a1ff24f96a
)
2017-02-10 13:59:32 +01:00
Gael Guennebaud
a0de6eb4ce Include clang in the list of non strict MSVC (just to be sure) 2017-02-10 13:41:52 +01:00
Alexander Neumann
7962ac1a58 fixed inlining issue with clang-cl on visual studio 2017-02-08 23:50:38 +01:00
Alexander Neumann
9c97b053f3 fixed compiling issue using clang-cl with visual studio 2017-02-08 23:50:09 +01:00
Gael Guennebaud
f61b0d56f0 Improve multi-threading heuristic for matrix products with a small number of columns.
(grafted from fc8fd5fd24
)
2017-02-07 17:19:59 +01:00
Gael Guennebaud
5087e016eb bug #1389: MSVC's std containers do not properly align in 64 bits mode if the requested alignment is larger than 16 bytes (e.g., with AVX)
(grafted from 4254b3eda3
)
2017-02-03 15:22:35 +01:00
Gael Guennebaud
fa9f5d7170 Fix compilation of JacobiSVD for vectors type
(grafted from 645a8e32a5
)
2017-01-31 16:22:54 +01:00
Gael Guennebaud
6975534cb2 bug #478: fix regression in the eigen decomposition of zero matrices.
(grafted from 53026d29d4
)
2017-01-31 14:22:42 +01:00
Gael Guennebaud
95c6d8db75 bug #1380: fix matrix exponential with Map<>
(grafted from 63de19c000
)
2017-01-30 13:55:27 +01:00
Gael Guennebaud
e0548e9ff3 bug #1384: fix evaluation of "sparse/scalar" that used the wrong evaluation path.
(grafted from c86911ac73
)
2017-01-30 13:38:24 +01:00
Gael Guennebaud
c289ef20f3 bug #1383: fix regression in LinSpaced for integers and high<low
(grafted from 850ca961d2
)
2017-01-25 18:13:53 +01:00
Gael Guennebaud
b8cf157e8c bug #1381: fix sparse.diagonal() used as a rvalue.
The problem was that is "sparse" is not const, then sparse.diagonal() must have the
LValueBit flag meaning that sparse.diagonal().coeff(i) must returns a const reference,
const Scalar&. However, sparse::coeff() cannot returns a reference for a non-existing
zero coefficient. The trick is to return a reference to a local member of
evaluator<SparseMatrix>.
(grafted from 296d24be4d
)
2017-01-25 17:39:01 +01:00
Gael Guennebaud
b4d2b404b0 bug #1383: Fix regression from 3.2 with LinSpaced(n,0,n-1) with n==0.
(grafted from d06a48959a
)
2017-01-25 15:27:13 +01:00
Gael Guennebaud
70fcaf9bd8 bug #1365: fix another type mismatch warning
(sync is set from and compared to an Index)
2016-12-28 23:35:43 +01:00
Gael Guennebaud
2f31c6b1d8 bug #1369: fix type mismatch warning.
Returned values of omp thread id and numbers are int,
o let's use int instead of Index here.
(grafted from 97812ff0d3
)
2016-12-28 23:29:35 +01:00
Gael Guennebaud
9e55467b4c bug #1375: fix cmake installation with cmake 2.8
(grafted from 156e6234f1
)
2017-01-24 09:16:40 +01:00
Gael Guennebaud
35bf99c63e bug #1376: add missing assertion on size mismatch with compound assignment operators (e.g., mat += mat.col(j))
(grafted from ba3f977946
)
2017-01-23 22:06:08 +01:00
Gael Guennebaud
f9b8729597 bug #1382: move using std::size_t/ptrdiff_t to Eigen's namespace (still better than the global namespace!)
(grafted from b0db4eff36
)
2017-01-23 22:03:57 +01:00
Gael Guennebaud
4b2e7f26aa Add std:: namespace prefix to all (hopefully) instances if size_t/ptrdfiff_t 2017-01-23 22:02:53 +01:00
Gael Guennebaud
5202bc92e6 Use Index instead of size_t
(grafted from 4b607b5692
)
2017-01-23 22:00:33 +01:00
Gael Guennebaud
9d83411cc4 bug #1379: fix compilation in sparse*diagonal*dense with openmp
(grafted from 0fe278f7be
)
2017-01-21 23:27:01 +01:00
Gael Guennebaud
556c03a09d bug #1378: fix doc (DiagonalIndex vs Diagonal)
(grafted from 22a172751e
)
2017-01-21 22:09:59 +01:00
Gael Guennebaud
ce463b9fa4 Added tag 3.3.2 for changeset 477d1e8192 2017-01-18 15:06:46 +01:00
Gael Guennebaud
477d1e8192 Bump to 3.3.2 2017-01-18 15:06:40 +01:00
Gael Guennebaud
0eaff8fdf2 Defer set-to-zero in triangular = product so that no aliasing issue occur in the common:
A.triangularView() = B*A.sefladjointView()*B.adjoint()
case that used to work in 3.2.
(grafted from 655ba783f8
)
2017-01-17 18:03:35 +01:00
Gael Guennebaud
582c96691b Fix typo 2017-01-16 13:36:56 +01:00
Gael Guennebaud
0b22158d9f Add missing doc of SparseView
(grafted from 831fffe874
)
2017-01-06 18:01:29 +01:00
Gael Guennebaud
dafdb0d8a8 MSVC 2015 has all we want about c++11 and MSVC 2017 fails on binder1st/binder2nd
(grafted from e383d6159a
)
2017-01-06 15:44:13 +01:00
Gael Guennebaud
1d1686c62b Convert integers to real numbers when computing relative L2 error
(grafted from f3f026c9aa
)
2017-01-05 13:36:08 +01:00
Gael Guennebaud
ad95b924d0 Fix and workaround several doxygen issues/warnings
(grafted from 2299717fd5
)
2017-01-04 23:27:33 +01:00
Gael Guennebaud
9499684320 Add doc for sparse triangular solve functions
(grafted from ee6f7f6c0c
)
2017-01-04 23:10:36 +01:00
Gael Guennebaud
5b6a31626b Add missing snippet files.
(grafted from 5165de97a4
)
2017-01-04 23:08:27 +01:00
Gael Guennebaud
bc3fee2d8e bug #1336: workaround doxygen failing to include numerous members of MatriBase in Matrix
(grafted from a0a36ad0ef
)
2017-01-04 22:02:39 +01:00
Gael Guennebaud
eaa9223277 Document selfadjointView
(grafted from 29a1a58113
)
2017-01-04 22:01:50 +01:00
Gael Guennebaud
c9ba1165e7 bug #1336: fix doxygen issue regarding EIGEN_CWISE_BINARY_RETURN_TYPE
(grafted from a5ebc92f8d
)
2017-01-04 18:21:44 +01:00
Gael Guennebaud
dd2d5d67ff bug #1370: add doc for StorageIndex
(grafted from 8702562177
)
2017-01-03 11:25:41 +01:00
Gael Guennebaud
404322b64f bug #1370: rename _Index to _StorageIndex in SparseMatrix, and add a warning in the doc regarding the 3.2 to 3.3 change of SparseMatrix::Index
(grafted from 575c078759
)
2017-01-03 11:19:14 +01:00
Marco Falke
ce37bae2cd doc: Fix trivial typo in AsciiQuickReference.txt
* * *
fixup!
(grafted from 4ebf69394d
)
2017-01-01 13:25:48 +00:00
Gael Guennebaud
3900dbc341 Make sure that traits<CwiseBinaryOp>::Flags reports the correct storage order so that methods like .outerSize()/.innerSize() work properly.
(grafted from d32a43e33a
)
2016-12-27 16:35:45 +01:00
Gael Guennebaud
5f586c2bd0 Add missing .outer() member to iterators of evaluators of cwise sparse binary expression
(grafted from 7136267461
)
2016-12-27 16:34:30 +01:00
Gael Guennebaud
215f88a417 Fix check of storage order mismatch for "sparse cwiseop sparse".
(grafted from fe0ee72390
)
2016-12-27 16:33:19 +01:00
Gael Guennebaud
2257f40f4a Merged in angelos_m/eigen/3.3 (pull request PR-269)
Remove superfluous const's (can cause warnings on some Intel compilers)
2016-12-21 08:53:16 +01:00
Gael Guennebaud
9e0fa0ef6d Fix bug #1367: compilation fix for gcc 4.1!
(grafted from 94e8d8902f
)
2016-12-20 22:17:01 +01:00
Gael Guennebaud
0fddbf3dc7 Add transpose, adjoint, conjugate methods to SelfAdjointView (useful to write generic code)
(grafted from 684cfc762d
)
2016-12-20 16:33:53 +01:00
Gael Guennebaud
eda635bd58 Make sure that HyperPlane::transform manitains a unit normal vector in the Affine case.
(grafted from f5d644b415
)
2016-12-20 09:35:00 +01:00
Benoit Jacob
26197bb467 Use 32 registers on ARM64 2016-12-19 13:44:46 -05:00
Gael Guennebaud
772e59d475 bug #1360: fix sign issue with pmull on altivec
(grafted from 8c0e701504
)
2016-12-18 22:13:19 +00:00
Gael Guennebaud
e8f83cbb5d Fix unused warning
(grafted from fc94258e77
)
2016-12-18 22:11:48 +00:00
Gael Guennebaud
dce584d799 bug #1363: fix mingw's ABI issue
(grafted from 5d00fdf0e8
)
2016-12-15 11:58:31 +01:00
Gael Guennebaud
0bcef9557d bug #1358: fix compilation for sparse += sparse.selfadjointView();
(grafted from 11b492e993
)
2016-12-14 17:53:47 +01:00
Gael Guennebaud
2b3c876b2a bug #1359: fix compilation of col_major_sparse.row() *= scalar
(used to work in 3.2.9 though the expression is not really writable)
(grafted from e67397bfa7
)
2016-12-14 17:05:26 +01:00
Gael Guennebaud
a05f6aad0e bug #1359: fix sparse /=scalar and *=scalar implementation.
InnerIterators must be obtained from an evaluator.
(grafted from 98d7458275
)
2016-12-14 17:03:13 +01:00
Gael Guennebaud
59187285e1 bug #1361: fix compilation issue in mat=perm.inverse()
(grafted from c817ce3ba3
)
2016-12-13 23:10:27 +01:00
Angelos Mantzaflaris
1dd074ea7e Merged eigen/eigen/3.3 into 3.3 2016-12-07 01:01:50 +01:00
Angelos Mantzaflaris
24fa7a01bd merge 2016-12-07 00:43:55 +01:00
Angelos Mantzaflaris
e236d3443c Remove superfluous const's (can cause warnings on some Intel compilers) 2016-12-07 00:37:48 +01:00
Gael Guennebaud
4ec8833220 Added tag 3.3.1 for changeset dd3685cc6a 2016-12-06 11:44:02 +01:00
Gael Guennebaud
dd3685cc6a Bump to 3.3.1 2016-12-06 11:43:58 +01:00
Gael Guennebaud
487a6e6515 Explain how to choose your favorite Eigen version
(grafted from 0c4d05b009
)
2016-12-06 11:34:06 +01:00
Silvio Traversaro
75f0b8aae3 Added relocatable cmake support also for CMake before 3.0 and after 2.8.8
(grafted from e049a2a72a
)
2016-12-06 10:37:34 +01:00
Gael Guennebaud
23aca8a586 Optimize SparseLU::solve for rhs vectors
(grafted from 8640ffac65
)
2016-12-05 15:41:14 +01:00
Gael Guennebaud
28bf2bf070 remove temporary in SparseLU::solve
(grafted from 62acd67903
)
2016-12-05 15:11:57 +01:00
Silvio Traversaro
0164f4c682 Make CMake config file relocatable
(grafted from 18481b518f
)
2016-12-05 10:39:52 +01:00
Gael Guennebaud
bbff608a42 Merged in angelos_m/eigen/3.3 (pull request PR-264)
add explicit template to numext::abs2 and fix signed/unsigned warning
2016-12-05 21:56:01 +00:00
Gael Guennebaud
ea56d2ff2c Fix memory leak in Ref<Sparse>
(grafted from a6b971e291
)
2016-12-05 16:59:30 +01:00
Gael Guennebaud
a4c8701e9a bug #1356: fix calls to evaluator::coeffRef(0,0) to get the address of the destination
by adding a dstDataPtr() member to the kernel. This fixes undefined behavior if dst is empty (nullptr).
(grafted from 0db6d5b3f4
)
2016-12-05 15:08:09 +01:00
Gael Guennebaud
a9bb9796e0 Ease compiler job to generate clean and efficient code in mat*vec.
(grafted from 66f65ccc36
)
2016-12-02 22:41:26 +01:00
Gael Guennebaud
449883be74 Operators += and -= do not resize!
(grafted from fe696022ec
)
2016-12-02 22:40:25 +01:00
Angelos Mantzaflaris
0a08d4c60b use numext::abs 2016-12-02 11:48:06 +01:00
Angelos Mantzaflaris
4086187e49 1. Add explicit template to abs2 (resolves deduction for some arithmetic types)
2. Avoid signed-unsigned conversion in comparison (warning in case Scalar is unsigned)
2016-12-02 11:39:18 +01:00
Christoph Hertzberg
91864f85d3 bug #1355: Fixed wrong line-endings on two files
(grafted from 22f7d398e2
)
2016-12-02 11:22:05 +01:00
Gael Guennebaud
c3597106ab Merged in angelos_m/eigen/3.3 (pull request PR-263)
fix two warnings(unused typedef, unused variable) and a typo
2016-12-02 09:02:39 +00:00
Gael Guennebaud
aed1d6597f Clean up SparseCore module regarding ReverseInnerIterator
(grafted from 27873008d4
)
2016-12-01 21:55:10 +01:00
Angelos Mantzaflaris
b6f04a2dd4 typo UIntPtr 2016-12-01 21:25:58 +01:00
Angelos Mantzaflaris
a9aa3bcf50 fix two warnings(unused typedef, unused variable) and a typo 2016-12-01 21:23:43 +01:00
Gael Guennebaud
32b8da66e3 fix member order
(grafted from 181138a1cb
)
2016-12-01 17:06:20 +01:00
Gael Guennebaud
eb94179ea3 Merged in sergiu/eigen/cmake-imported-target (pull request PR-257)
CMake imported target (take #2)
2016-12-01 15:13:48 +00:00
Gael Guennebaud
52a7386aef Fix misleading-indentation warnings.
(grafted from 037b46762d
)
2016-12-01 16:05:42 +01:00
Gael Guennebaud
8cada1d894 Fix slection of product implementation for dynamic size matrices with fixed max size.
(grafted from 8df272af88
)
2016-11-30 22:21:33 +01:00
Gael Guennebaud
6e4a664c42 Fix a performance regression in (mat*mat)*vec for which mat*mat was evaluated multiple times.
(grafted from c927af60ed
)
2016-11-30 17:59:13 +01:00
Gael Guennebaud
1cd1a96d56 bug #1351: fix compilation of random with old compilers
(grafted from ab4ef5e66e
)
2016-11-30 17:37:53 +01:00
Sergiu Deitsch
86ab00cdcf cmake: remove architecture dependency from Eigen3ConfigVersion.cmake
Also, install Eigen3*.cmake under $prefix/share/eigen3/cmake by default.
2016-11-30 15:46:46 +01:00
Sergiu Deitsch
65f09be8d2 doc: mention the NO_MODULE option and target availability 2016-11-30 15:41:38 +01:00
Gael Guennebaud
400d756b82 bug #1348: Document EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES,
and reflect in the doc that EIGEN_DONT_ALIGN* are deprecated.
(grafted from 21d0286d81
)
2016-11-23 22:15:03 +01:00
Gael Guennebaud
9d31798a84 update cdash project for 3.3 2016-11-23 14:13:08 +01:00
Gael Guennebaud
723ed92e0e Fix compilation with gcc and old ABI version
(grafted from e340866c81
)
2016-11-23 14:04:57 +01:00
Gael Guennebaud
0a7de0b273 Fix compilation issue with MSVC:
MSVC always messes up with shadowed template arguments, for instance in:
  struct B { typedef float T; }
  template<typename T> struct A : B {
    T g;
  };
The type of A<double>::g will be float and not double.
(grafted from a91de27e98
)
2016-11-23 12:24:48 +01:00
Gael Guennebaud
d6b9bc1ccd Optimize predux<Packet8f> (AVX)
(grafted from 74637fa4e3
)
2016-11-22 21:57:52 +01:00
Gael Guennebaud
0eff51e2ed Disable usage of SSE3 _mm_hadd_ps that is extremely slow.
(grafted from 178c084856
)
2016-11-22 21:53:14 +01:00
Gael Guennebaud
1b7dd46d94 Optimize predux<Packet4d> (AVX)
(grafted from 7dd894e40e
)
2016-11-22 21:41:30 +01:00
Gael Guennebaud
b2eb1bf3dc Disable usage of SSE3 haddpd that is extremely slow.
(grafted from f3fb0a1940
)
2016-11-22 16:58:31 +01:00
Gael Guennebaud
fe48c25682 Revert vec/y to vec*(1/y) in row-major TRSM:
- div is extremely costly
- this is consistent with the column-major case
- this is consistent with all other BLAS implementations
(grafted from eb621413c1
)
2016-12-06 15:04:50 +01:00
Gael Guennebaud
0ba6da3470 Fix BLAS backend for symmetric rank K updates.
(grafted from 8365c2c941
)
2016-12-06 14:47:09 +01:00
Sergiu Deitsch
a287140f72 cmake: added Eigen3::Eigen imported target 2016-11-22 12:25:06 +01:00
Gael Guennebaud
4d89ec8a00 Fix regression in assigment of sparse block to spasre block.
(grafted from 6a84246a6a
)
2016-11-21 21:46:42 +01:00
Chun Wang
441760f239 Workaround for error in VS2012 with /clr
(grafted from 0d0948c3b9
)
2016-11-17 17:54:27 -05:00
Gael Guennebaud
664162fb8a Fix compilation issue in mat = permutation (regression introduced in 8193ffb3d3
)
(grafted from 465ede0f20
)
2016-11-20 09:41:37 +01:00
Gael Guennebaud
aa3c761002 bug #1343: fix compilation regression in mat+=selfadjoint_view.
Generic EigenBase2EigenBase assignment was incomplete.
(grafted from 8193ffb3d3
)
2016-11-18 10:17:34 +01:00
Gael Guennebaud
94f2cfc9c7 bug #1343: fix compilation regression in array = matrix_product
(grafted from cebff7e3a2
)
2016-11-18 10:09:33 +01:00
Konstantinos Margaritis
4a13d79df6 replace sizeof(Packet) with PacketSize else it breaks for ZVector.Packet4f
(grafted from a1d5c503fa
)
2016-11-17 13:27:45 -05:00
Konstantinos Margaritis
463176cc44 implement float/std::complex<float> for ZVector as well, minor fixes to ZVector
(grafted from 672aa97d4d
)
2016-11-17 13:27:33 -05:00
Gael Guennebaud
5aab97fba6 Optimize sparse<bool> && sparse<bool> to use the same path as for coeff-wise products.
(grafted from 0ee92aa38e
)
2016-11-14 18:47:41 +01:00
Gael Guennebaud
89abc6806d bug #426: move operator && and || to MatrixBase and SparseMatrixBase.
(grafted from 2e334f5da0
)
2016-11-14 18:47:02 +01:00
Niels Ole Salscheider
baf793ebaa Make sure not to call numext::maxi on expression templates
(grafted from 51fef87408
)
2016-11-12 12:20:57 +01:00
Gael Guennebaud
b4ddafcfac Fix regression in SparseMatrix::ReverseInnerIterator
(grafted from eedb87f4ba
)
2016-11-14 14:05:53 +01:00
Gael Guennebaud
1079967710 Added tag 3.3.0 for changeset eeac81b8c0 2016-11-10 13:57:29 +01:00
Gael Guennebaud
eeac81b8c0 bump to 3.3.0 2016-11-10 13:55:14 +01:00
Gael Guennebaud
e80bc2ddb0 Fix printing of sparse expressions 2016-11-10 10:35:32 +01:00
Benoit Steiner
db3903498d Merged in benoitsteiner/opencl (pull request PR-246)
Improved support for OpenCL
2016-11-08 22:28:44 +00:00
Benoit Steiner
dcc14bee64 Fixed the formatting of the code 2016-11-08 14:24:46 -08:00
Benoit Steiner
b88c1117d4 Fixed the indentation of the cmake file 2016-11-08 14:22:36 -08:00
Luke Iwanski
912cb3d660 #if EIGEN_EXCEPTION -> #ifdef EIGEN_EXCEPTIONS. 2016-11-08 22:01:14 +00:00
Luke Iwanski
1b345b0895 Fix for SYCL queue initialisation. 2016-11-08 21:56:31 +00:00
Luke Iwanski
1b95717358 Use try/catch only when exceptions are enabled. 2016-11-08 21:08:53 +00:00
Mehdi Goli
d57430dd73 Converting all sycl buffers to uninitialised device only buffers; adding memcpyHostToDevice and memcpyDeviceToHost on syclDevice; modifying all examples to obey the new rules; moving sycl queue creating to the device based on Benoit suggestion; removing the sycl specefic condition for returning m_result in TensorReduction.h according to Benoit suggestion. 2016-11-08 17:08:02 +00:00
Gael Guennebaud
73985ead27 Extend unit test to check sparse solvers with a SparseVector as the rhs and result. 2016-11-06 20:29:57 +01:00
Gael Guennebaud
436a111792 Generalize Cholmod support to hanlde any sparse type as the rhs and result of the solve method 2016-11-06 20:29:23 +01:00
Gael Guennebaud
afc55b1885 Generalize IterativeSolverBase::solve to hanlde any sparse type as the results (instead of SparseMatrix only) 2016-11-06 20:28:18 +01:00
Gael Guennebaud
a5c2d8a3cc Generalize solve_sparse_through_dense_panels to handle SparseVector. 2016-11-06 15:20:58 +01:00
Gael Guennebaud
f8bfe10613 Add missing friend declaration 2016-11-06 15:20:30 +01:00
Gael Guennebaud
fc7180cda8 Add a default ctor to evaluator<SparseVector>.
Needed for evaluator<Solve>.
2016-11-06 15:20:00 +01:00
Gael Guennebaud
4d226ab5b5 Enable swapping between SparseMatrix and SparseVector 2016-11-06 15:15:03 +01:00
Benoit Steiner
ad086b03e4 Removed unnecessary statement 2016-11-05 12:43:27 -07:00
Benoit Steiner
dad177be01 Added missing includes 2016-11-05 10:04:42 -07:00
Gael Guennebaud
55b4fd1d40 Extend mpreal unit test to check LLT with complexes. 2016-11-05 11:28:53 +01:00
Gael Guennebaud
a354c3ca59 Fix compilation of LLT with complex<mpreal>. 2016-11-05 11:28:29 +01:00
Benoit Steiner
d46a36cc84 Merged eigen/eigen into default 2016-11-04 18:22:55 -07:00
Mehdi Goli
0ebe3808ca Removed the sycl include from Eigen/Core and moved it to Unsupported/Eigen/CXX11/Tensor; added TensorReduction for sycl (full reduction and partial reduction); added TensorReduction test case for sycl (full reduction and partial reduction); fixed the tile size on TensorSyclRun.h based on the device max work group size; 2016-11-04 18:18:19 +00:00
Gael Guennebaud
47d1b4a609 Added tag 3.3-rc2 for changeset ba05572dcb 2016-11-04 09:09:18 +01:00
Gael Guennebaud
ba05572dcb bump to 3.3-rc2 2016-11-04 09:09:06 +01:00
Benoit Steiner
5c3995769c Improved AVX512 configuration 2016-11-03 04:50:28 -07:00
Benoit Steiner
fbe672d599 Reenable the generation of dynamic blas libraries. 2016-11-03 04:08:43 -07:00
Benoit Steiner
ca0ba0d9a4 Improved AVX512 support 2016-11-03 04:00:49 -07:00
Benoit Steiner
c80587c92b Merged eigen/eigen into default 2016-11-03 03:55:11 -07:00
Gael Guennebaud
3f1d0cdc22 bug #1337: improve doc of homogeneous() and hnormalized() 2016-11-03 11:03:08 +01:00
Gael Guennebaud
78e93ac1ad bug #1330: Cholmod supports double precision only, so let's trigger a static assertion if the scalar type does not match this requirement. 2016-11-03 10:21:59 +01:00
Benoit Steiner
3e37166d0b Merged in benoitsteiner/opencl (pull request PR-244)
Disable vectorization on device only when compiling for sycl
2016-11-02 22:01:03 +00:00
Benoit Steiner
0585b2965d Disable vectorization on device only when compiling for sycl 2016-11-02 11:44:27 -07:00
Benoit Steiner
e6e77ed08b Don't call lgamma_r when compiling for an Apple device, since the function isn't available on MacOS 2016-11-02 09:55:39 -07:00
Benoit Steiner
b238f387b4 Pulled latest updates from trunk 2016-11-02 08:53:13 -07:00
Benoit Steiner
c8db17301e Special functions require math.h: make sure it is included. 2016-11-02 08:51:52 -07:00
Gael Guennebaud
a07bb428df bug #1004: improve accuracy of LinSpaced for abs(low) >> abs(high). 2016-11-02 11:34:38 +01:00
Gael Guennebaud
598de8b193 Add pinsertfirst function and implement pinsertlast for complex on SSE/AVX. 2016-11-02 10:38:13 +01:00
Benoit Steiner
e44519744e Merged in benoitsteiner/opencl (pull request PR-243)
Fixed the ambiguity in callig make_tuple for sycl backend.
2016-11-02 02:56:58 +00:00
Rasmus Munk Larsen
0a6ae41555 Merged eigen/eigen into default 2016-11-01 15:37:00 -07:00
Rasmus Munk Larsen
b730952414 Don't attempts to use lgamma_r for CUDA devices.
Fix type in lgamma_impl<double>.
2016-11-01 15:34:19 -07:00
Benoit Steiner
7a0e96b80d Gate the code that refers to cuda fp16 primitives more thoroughly 2016-11-01 12:08:09 -07:00
Mehdi Goli
51af6ae971 Fixed the ambiguity in callig make_tuple for sycl backend. 2016-10-31 16:35:51 +00:00
Benoit Steiner
0a9ad6fc72 Worked around Visual Studio compilation errors 2016-10-28 07:54:27 -07:00
Benoit Steiner
d5f88e2357 Sharded the tensor_image_patch test to help it run on low power devices 2016-10-27 21:48:21 -07:00
Benoit Steiner
0b4b0f11e8 Fixed a few more compilation warnings 2016-10-28 04:01:01 +00:00
Benoit Steiner
306daa24a3 Fixed a compilation warning 2016-10-28 03:50:31 +00:00
Benoit Steiner
8471cf1996 Fixed compilation warning 2016-10-28 03:46:08 +00:00
Benoit Steiner
b0c5bfdf78 Added missing template parameters 2016-10-28 03:43:41 +00:00
Rasmus Munk Larsen
2ebb314fa7 Use threadsafe versions of lgamma and lgammaf if possible. 2016-10-27 16:17:12 -07:00
Gael Guennebaud
530f20c21a Workaround MSVC issue. 2016-10-27 21:51:37 +02:00
Gael Guennebaud
c3ce4f9ac0 Merged in enricodetoma/eigen (pull request PR-241)
Always enable /bigobj for tests to avoid a compile error in MSVC 2015
2016-10-27 19:21:28 +00:00
Benoit Steiner
7d64e6752c Pulled latest updates from trunk 2016-10-26 18:48:06 -07:00
Benoit Steiner
0a4c4d40b4 Removed a template parameter for fixed sized tensors 2016-10-26 18:47:37 -07:00
Gael Guennebaud
3ecb343dc3 Fix regression in X = (X*X.transpose())/s with X rectangular by deferring resizing of the destination after the creation of the evaluator of the source expression. 2016-10-26 22:50:41 +02:00
enrico.detoma
6ed571744b Always enable /bigobj for tests to avoid a compile error in MSVC 2015 2016-10-26 22:48:46 +02:00
Gael Guennebaud
97feea9d39 add a generic EIGEN_HAS_CXX11 2016-10-26 15:53:13 +02:00
Gael Guennebaud
ca6a2a5248 Fix warning with ICC 2016-10-26 14:13:05 +02:00
Benoit Steiner
5f2dd503ff Replaced tabs with spaces 2016-10-25 20:40:58 -07:00
Benoit Steiner
1644bafe29 Code cleanup 2016-10-25 20:36:14 -07:00
Gael Guennebaud
b15a5dc3f4 Fix ICC warnings 2016-10-25 22:20:24 +02:00
Gael Guennebaud
aad72f3c6d Add missing inline keywords 2016-10-25 20:20:09 +02:00
Benoit Steiner
3e194a6a73 Fixed a typo 2016-10-25 08:42:15 -07:00
Gael Guennebaud
58146be99b bug #1004: one more rewrite of LinSpaced for floating point numbers to guarantee both interpolation and monotonicity.
This version simply does low+i*step plus a branch to return high if i==size-1.
Vectorization is accomplished with a branch and the help of pinsertlast.
Some quick benchmark revealed that the overhead is really marginal, even when filling small vectors.
2016-10-25 16:53:09 +02:00
Gael Guennebaud
13fc18d3a2 Add a pinsertlast function replacing the last entry of a packet by a scalar.
(useful to vectorize LinSpaced)
2016-10-25 16:48:49 +02:00
Gael Guennebaud
2634f9386c bug #1333: fix bad usage of const_cast_derived. Better use .data() for that purpose. 2016-10-24 22:22:35 +02:00
Gael Guennebaud
9e8f07d7b5 Cleanup ArrayWrapper and MatrixWrapper by removing redundant accessors. 2016-10-24 22:16:48 +02:00
Gael Guennebaud
b027d7a8cf bug #1004: remove the inaccurate "sequential" path for LinSpaced, mark respective function as deprecated, and enforce strict interpolation of the higher range using a correction term.
Now, even with floating point precision, both the 'low' and 'high' bounds are exactly reproduced at i=0 and i=size-1 respectively.
2016-10-24 20:27:21 +02:00
Benoit Steiner
b11aab5fcc Merged in benoitsteiner/opencl (pull request PR-238)
Added support for OpenCL to the Tensor Module
2016-10-24 15:30:45 +00:00
Gael Guennebaud
53c77061f0 bug #698: rewrite LinSpaced for integer scalar types to avoid overflow and guarantee an even spacing when possible.
Otherwise, the "high" bound is implicitly lowered to the largest value allowing for an even distribution.
This changeset also disable vectorization for this integer path.
2016-10-24 15:50:27 +02:00
Gael Guennebaud
e8e56c7642 Add unit test for overflow in LinSpaced 2016-10-24 15:43:51 +02:00
Gael Guennebaud
40f62974b7 bug #1328: workaround a compilation issue with gcc 4.2 2016-10-20 19:19:37 +02:00
Benoit Steiner
cf20b30d65 Merge latest updates from trunk 2016-10-20 09:42:05 -07:00
Luke Iwanski
03b63e182c Added SYCL include in Tensor. 2016-10-20 15:32:44 +01:00
Benoit Steiner
d3943cd50c Fixed a few typos in the ternary tensor expressions types 2016-10-19 12:56:12 -07:00
Mehdi Goli
8fb162fc85 Fixing the typo regarding missing #if needed for proper handling of exceptions in Eigen/Core. 2016-10-16 12:52:34 +01:00
Mehdi Goli
e36cb91c99 Fixing the code indentation in the TensorReduction.h file. 2016-10-14 18:03:00 +01:00
Luke Iwanski
2e188dd4d4 Merged ComputeCpp to default. 2016-10-14 16:47:40 +01:00
Mehdi Goli
15380f9a87 Applyiing Benoit's comment to return the missing line back in Eigen/Core 2016-10-14 16:39:41 +01:00
Gael Guennebaud
692b30ca95 Fix previous merge. 2016-10-14 17:16:28 +02:00
Gael Guennebaud
050c681bdd Merged in rmlarsen/eigen2 (pull request PR-232)
Improve performance of parallelized matrix multiply for rectangular matrices
2016-10-14 14:51:09 +00:00
Luke Iwanski
e742da8b28 Merged ComputeCpp into default. 2016-10-14 13:36:51 +01:00
Mehdi Goli
524fa4c46f Reducing the code by generalising sycl backend functions/structs. 2016-10-14 12:09:55 +01:00
Benoit Steiner
737e4152c3 Merged in lukier/eigen (pull request PR-234)
Enabling CUDA in Geometry
2016-10-13 18:09:28 +00:00
Benoit Steiner
d0ee2267d6 Relaxed the resizing checks so that they don't fail with gcc >= 5.3 2016-10-13 10:59:46 -07:00
Robert Lukierski
a94791b69a Fixes for min and abs after Benoit's comments, switched to numext. 2016-10-13 15:00:22 +01:00
Avi Ginsburg
ac63d6891c Patch to allow VS2015 & CUDA 8.0 to compile with Eigen included. I'm not sure
whether to limit the check to this compiler combination
(` || (EIGEN_COMP_MSVC == 1900 &&  __CUDACC_VER__) `)
or to leave it as it is. I also don't know if this will have any affect on
including Eigen in device code (I'm not in my current project).
2016-10-13 08:47:32 +00:00
Benoit Steiner
7e4a6754b2 Merged eigen/eigen into default 2016-10-12 22:42:33 -07:00
Benoit Steiner
38b6048e14 Deleted redundant implementation of predux 2016-10-12 14:37:56 -07:00
Gael Guennebaud
e74612b9a0 Remove double ;; 2016-10-12 22:49:47 +02:00
Benoit Steiner
78d2926508 Merged eigen/eigen into default 2016-10-12 13:46:29 -07:00
Benoit Steiner
2e2f48e30e Take advantage of AVX512 instructions whenever possible to speedup the processing of 16 bit floats. 2016-10-12 13:45:39 -07:00
Gael Guennebaud
f939c351cb Fix SPQR for rectangular matrices 2016-10-12 22:39:33 +02:00
Gael Guennebaud
091d373ee9 Fix outer-stride. 2016-10-12 21:47:52 +02:00
Robert Lukierski
471075f7ad Fixes min() warnings. 2016-10-12 18:59:05 +01:00
Gael Guennebaud
5c366fe1d7 Merged in rmlarsen/eigen (pull request PR-230)
Fix a bug in psqrt for SSE and AVX when EIGEN_FAST_MATH=1
2016-10-12 16:30:51 +00:00
Robert Lukierski
86711497c4 Adding EIGEN_DEVICE_FUNC in the Geometry module.
Additional CUDA necessary fixes in the Core (mostly usage of
EIGEN_USING_STD_MATH).
2016-10-12 16:35:17 +01:00
Rasmus Munk Larsen
47150af1c8 Fix copy-paste error: Must use _mm256_cmp_ps for AVX. 2016-10-12 08:34:39 -07:00
Gael Guennebaud
89e315152c bug #1325: fix compilation on NEON with clang 2016-10-12 16:55:47 +02:00
Benoit Steiner
7f0599b6eb Manually define int16_t and uint16_t when compiling with Visual Studio 2016-10-08 22:56:32 -07:00
Benoit Steiner
5727e4d89c Reenabled the use of variadic templates on tegra x1 provides that the latest version (i.e. JetPack 2.3) is used. 2016-10-08 22:19:03 +00:00
Benoit Steiner
5266ff8966 Cleaned up a regression test 2016-10-08 19:12:44 +00:00
Benoit Steiner
5c68051cd7 Merge the content of the ComputeCpp branch into the default branch 2016-10-07 11:04:16 -07:00
Gael Guennebaud
4860727ac2 Remove static qualifier of free-functions (inline is enough and this helps ICC to find the right overload) 2016-10-07 09:21:12 +02:00
Benoit Steiner
507b661106 Renamed predux_half into predux_downto4 2016-10-06 17:57:04 -07:00
Benoit Steiner
a498ff7df6 Fixed incorrect comment 2016-10-06 15:27:27 -07:00
Benoit Steiner
8ba3c41fcf Revergted unecessary change 2016-10-06 15:12:15 -07:00
Benoit Steiner
a7473d6d5a Fixed compilation error with gcc >= 5.3 2016-10-06 14:33:22 -07:00
Benoit Steiner
5e64cea896 Silenced a compilation warning 2016-10-06 14:24:17 -07:00
Benoit Steiner
33fba3f08d Merged in rryan/eigen/tensorfunctors (pull request PR-233)
Fully support complex types in SumReducer and MeanReducer when building for CUDA by using scalar_sum_op and scalar_product_op instead of operator+ and operator*.
2016-10-06 12:29:19 -07:00
RJ Ryan
bfc264abe8 Add a test that GPU complex product reductions match CPU reductions. 2016-10-06 11:10:14 -07:00
RJ Ryan
e2e9cdd169 Fully support complex types in SumReducer and MeanReducer when building for CUDA by using scalar_sum_op and scalar_product_op instead of operator+ and operator*. 2016-10-06 10:49:48 -07:00
Benoit Steiner
d485d12c51 Added missing AVX intrinsics for fp16: in particular, implemented predux which is required by the matrix-vector code. 2016-10-06 10:41:03 -07:00
Rasmus Munk Larsen
48c635e223 Add a simple cost model to prevent Eigen's parallel GEMM from using too many threads when the inner dimension is small.
Timing for square matrices is unchanged, but both CPU and Wall time are significantly improved for skinny matrices. The benchmarks below are for multiplying NxK * KxN matrices with test names of the form BM_OuterishProd/N/K.

Improvements in Wall time:

Run on [redacted] (12 X 3501 MHz CPUs); 2016-10-05T17:40:02.462497196-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_OuterishProd/64/1                    3088      1610    +47.9%
BM_OuterishProd/64/4                    3562      2414    +32.2%
BM_OuterishProd/64/32                   8861      7815    +11.8%
BM_OuterishProd/128/1                  11363      6504    +42.8%
BM_OuterishProd/128/4                  11128      9794    +12.0%
BM_OuterishProd/128/64                 27691     27396     +1.1%
BM_OuterishProd/256/1                  33214     28123    +15.3%
BM_OuterishProd/256/4                  34312     36818     -7.3%
BM_OuterishProd/256/128               174866    176398     -0.9%
BM_OuterishProd/512/1                7963684    104224    +98.7%
BM_OuterishProd/512/4                7987913    112867    +98.6%
BM_OuterishProd/512/256              8198378   1306500    +84.1%
BM_OuterishProd/1k/1                 7356256    324432    +95.6%
BM_OuterishProd/1k/4                 8129616    331621    +95.9%
BM_OuterishProd/1k/512              27265418   7517538    +72.4%

Improvements in CPU time:

Run on [redacted] (12 X 3501 MHz CPUs); 2016-10-05T17:40:02.462497196-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_OuterishProd/64/1                    6169      1608    +73.9%
BM_OuterishProd/64/4                    7117      2412    +66.1%
BM_OuterishProd/64/32                  17702     15616    +11.8%
BM_OuterishProd/128/1                  45415      6498    +85.7%
BM_OuterishProd/128/4                  44459      9786    +78.0%
BM_OuterishProd/128/64                110657    109489     +1.1%
BM_OuterishProd/256/1                 265158     28101    +89.4%
BM_OuterishProd/256/4                 274234    183885    +32.9%
BM_OuterishProd/256/128              1397160   1408776     -0.8%
BM_OuterishProd/512/1               78947048    520703    +99.3%
BM_OuterishProd/512/4               86955578   1349742    +98.4%
BM_OuterishProd/512/256             74701613  15584661    +79.1%
BM_OuterishProd/1k/1                78352601   3877911    +95.1%
BM_OuterishProd/1k/4                78521643   3966221    +94.9%
BM_OuterishProd/1k/512              258104736  89480530    +65.3%
2016-10-06 10:33:10 -07:00
Benoit Steiner
9f3276981c Enabling AVX512 should also enable AVX2. 2016-10-06 10:29:48 -07:00
Gael Guennebaud
80b5133789 Fix compilation of qr.inverse() for column and full pivoting variants. 2016-10-06 09:55:50 +02:00
Benoit Steiner
4131074818 Deleted unecessary CMakeLists.txt file 2016-10-05 18:54:35 -07:00
Benoit Steiner
cb5cd69872 Silenced a compilation warning. 2016-10-05 18:50:53 -07:00
Benoit Steiner
78b569f685 Merged latest updates from trunk 2016-10-05 18:48:55 -07:00
Benoit Steiner
9c2b6c049b Silenced a few compilation warnings 2016-10-05 18:37:31 -07:00
Benoit Steiner
6f3cd529af Pulled latest updates from trunk 2016-10-05 18:31:43 -07:00
Benoit Steiner
d7f9679a34 Fixed a couple of compilation warnings 2016-10-05 15:00:32 -07:00
Benoit Steiner
ae1385c7e4 Pull the latest updates from trunk 2016-10-05 14:54:36 -07:00
Benoit Steiner
73b0012945 Fixed compilation warnings 2016-10-05 14:24:24 -07:00
Benoit Steiner
c84084c0c0 Fixed compilation warning 2016-10-05 14:15:41 -07:00
Benoit Steiner
4387433acf Increased the robustness of the reduction tests on fp16 2016-10-05 10:42:41 -07:00
Benoit Steiner
aad20d700d Increase the tolerance to numerical noise. 2016-10-05 10:39:24 -07:00
Benoit Steiner
8b69d5d730 ::rand() returns a signed integer on win32 2016-10-05 08:55:02 -07:00
Benoit Steiner
ed7a220b04 Fixed a typo that impacts windows builds 2016-10-05 08:51:31 -07:00
Benoit Steiner
ceee1c008b Silenced compilation warning 2016-10-04 18:47:53 -07:00
Benoit Steiner
698ff69450 Properly characterize the CUDA packet primitives for fp16 as device only 2016-10-04 16:53:30 -07:00
Rasmus Munk Larsen
7f67e6dfdb Update comment for fast sqrt. 2016-10-04 15:09:11 -07:00
Rasmus Munk Larsen
765615609d Update comment for fast sqrt. 2016-10-04 15:08:41 -07:00
Rasmus Munk Larsen
3ed67cb0bb Fix a bug in the implementation of Carmack's fast sqrt algorithm in Eigen (enabled by EIGEN_FAST_MATH), which causes the vectorized parts of the computation to return -0.0 instead of NaN for negative arguments.
Benchmark speed in Giga-sqrts/s
Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
-----------------------------------------
                    SSE        AVX
Fast=1              2.529G     4.380G
Fast=0              1.944G     1.898G
Fast=1 fixed        2.214G     3.739G

This table illustrates the worst case in terms speed impact: It was measured by repeatedly computing the sqrt of an n=4096 float vector that fits in L1 cache. For large vectors the operation becomes memory bound and the differences between the different versions almost negligible.
2016-10-04 14:22:56 -07:00
Benoit Steiner
6af5ac7e27 Cleanup the cuda executor code. 2016-10-04 08:52:13 -07:00
Benoit Steiner
2f6d1607c8 Cleaned up the random number generation code. 2016-10-04 08:38:23 -07:00
Benoit Steiner
881b90e984 Use explicit type casting to generate packets of zeros. 2016-10-04 08:23:38 -07:00
Benoit Steiner
616a7a1912 Improved support for compiling CUDA code with clang as the host compiler 2016-10-03 17:09:33 -07:00
Benoit Steiner
409e887d78 Added support for constand std::complex numbers on GPU 2016-10-03 11:06:24 -07:00
Gael Guennebaud
9d6d0dff8f bug #1317: fix performance regression with some Block expressions and clang by helping it to remove dead code.
The trick is to get rid of the nested expression in the evaluator by copying only the required information (here, the strides).
2016-10-01 15:37:00 +02:00
Gael Guennebaud
8b84801f7f bug #1310: workaround a compilation regression from 3.2 regarding triangular * homogeneous 2016-09-30 22:49:59 +02:00
Benoit Steiner
422530946f Renamed the SYCL tests to follow the standard naming convention. 2016-09-30 08:22:10 -07:00
Gael Guennebaud
67b4f45836 Fix angle range 2016-09-30 12:46:33 +02:00
Gael Guennebaud
27f3970453 Remove std:: prefix 2016-09-30 12:40:41 +02:00
Gael Guennebaud
3860a0bc8f bug #1312: Quaternion to AxisAngle conversion now ensures the angle will be in the range [-pi,pi]. This also increases accuracy when q.w is negative. 2016-09-29 23:23:35 +02:00
Gael Guennebaud
33500050c3 bug #1308: fix compilation of some small products involving nullary-expressions. 2016-09-29 09:40:44 +02:00
Benoit Steiner
27d7628f16 Updated the list of warnings to reflect the new message ids introduced in cuda 8.0 2016-09-28 17:42:59 -07:00
Benoit Steiner
2bda1b0d93 Updated the tensor sum and mean reducer to enable them to process complex numbers on cuda gpus. 2016-09-28 17:08:41 -07:00
Mehdi Goli
dd602e62c8 Converting alias template to nested struct in order to be compatible with CXX-03 2016-09-27 16:21:19 +01:00
Gael Guennebaud
f3a00dd2b5 Merged in sergiu/eigen (pull request PR-229)
Disabled MSVC level 4 warning C4714
2016-09-27 09:28:08 +02:00
Gael Guennebaud
892afb9416 Add debug info. 2016-09-26 23:53:57 +02:00
Gael Guennebaud
779774f98c bug #1311: fix alignment logic in some cases of (scalar*small).lazyProduct(small) 2016-09-26 23:53:40 +02:00
Benoit Steiner
6565f8d60f Made the initialization of a CUDA device thread safe. 2016-09-26 11:00:32 -07:00
Gael Guennebaud
48dfe98abd bug #1308: fix compilation of vector * rowvector::nullary. 2016-09-25 14:54:35 +02:00
Sergiu Deitsch
fe29157d02 disabled MSVC level 4 warning C4714
The level 4 warning (/W4) warns about functions marked as __forceinline not
inlined, and generates a lot of noise.
2016-09-25 14:25:47 +02:00
Benoit Steiner
f6ac51a054 Made TensorEvalTo compatible with c++0x again. 2016-09-23 16:45:17 -07:00
Benoit Steiner
00d4e65f00 Deleted unused TensorMap data member 2016-09-23 16:44:45 -07:00
Gael Guennebaud
86caba838d bug #1304: fix Projective * scaling and Projective *= scaling 2016-09-23 13:41:21 +02:00
Gael Guennebaud
b9f7a17e47 Add missing file. 2016-09-23 10:26:08 +02:00
Benoit Steiner
1301d744f8 Made the gaussian generator usable on GPU 2016-09-22 19:04:44 -07:00
Benoit Steiner
2a69290ddb Added a specialization of Eigen::numext::real and Eigen::numext::imag for std::complex<T> to be used when compiling a cuda kernel. This is unfortunately necessary to be able to process complex numbers from a CUDA kernel on MacOS. 2016-09-22 15:52:23 -07:00
Gael Guennebaud
3946768916 Added tag 3.3-rc1 for changeset 77e27fbeee 2016-09-22 22:38:36 +02:00
Luke Iwanski
c771df6bc3 Updated the owners of the file. 2016-09-19 14:09:25 +01:00
Luke Iwanski
b91e021172 Merged with default. 2016-09-19 14:03:54 +01:00
Luke Iwanski
cb81975714 Partial OpenCL support via SYCL compatible with ComputeCpp CE. 2016-09-19 12:44:13 +01:00
Benoit Steiner
f899e08946 Enabled a number of tests previously disabled by mistake 2016-05-03 14:07:47 -07:00
Benoit Steiner
4c05fb03a3 Merged eigen/eigen into default 2016-05-03 13:15:00 -07:00
Benoit Steiner
577a07a86e Re-enabled the product_small test now that everything compiles correctly. 2016-05-03 13:11:38 -07:00
Benoit Steiner
3b8da4be5a Extended the packetmath test to cover all the alignments made possible by avx512 instructions. 2016-04-29 14:13:43 -07:00
Benoit Steiner
2f28ccbea3 Update the makefile to make the tests compile with gcc 4.9 2016-04-29 14:11:09 -07:00
Benoit Steiner
7a4bd337d9 Resolved merge conflict 2016-04-29 13:42:22 -07:00
Benoit Steiner
07a247dcf4 Pulled latest updates from upstream 2016-04-29 13:41:26 -07:00
Benoit Steiner
fa5a8f055a Implemented palign_impl for AVX512 2016-04-29 13:30:13 -07:00
Benoit Steiner
ef3ac9d05a Fixed the AVX512 packet traits 2016-04-29 13:28:36 -07:00
Benoit Steiner
d7b75e8d86 Added pdiv packet primitives for avx512 2016-04-29 13:26:47 -07:00
Benoit Steiner
5e89ded685 Implemented preduxp for AVX512 2016-04-29 13:00:33 -07:00
Benoit Steiner
5f85662ad8 Implemented the pabs and preverse primitives for avx512. 2016-04-29 12:53:34 -07:00
Benoit Steiner
d37ee89ca8 Disabled some of the AVX512 primitives on compilers that don't support them 2016-04-29 12:50:29 -07:00
Benoit Steiner
8bfe739cd2 Updated the AVX512 PacketMath to properly leverage the AVX512DQ instructions 2016-04-11 18:40:16 -07:00
Benoit Steiner
d6e596174d Pull latest updates from upstream 2016-04-11 17:20:17 -07:00
Benoit Steiner
3ca1ae2bb7 Commented out the version of pexp<Packet8d> since it fails to compile with gcc 5.3 2016-02-04 13:49:06 -08:00
Benoit Steiner
23f69ab936 Added implementations of pexp, plog, psqrt, and prsqrt optimized for AVX512 2016-02-04 10:36:36 -08:00
Benoit Steiner
6c9cf117c1 Fixed indentation 2016-02-04 10:34:10 -08:00
Benoit Steiner
d93b71a301 Updated the packetmath test to call predux_half instead of predux4 2016-02-01 15:18:33 -08:00
Benoit Steiner
ef66f2887b Updated the matrix multiplication code to make it compile with AVX512 enabled. 2016-02-01 14:38:05 -08:00
Benoit Steiner
85b6d82b49 Generalized predux4 to support AVX512 packets, and renamed it predux_half.
Disabled the implementation of pabs for avx512 since the corresponding intrinsics are not shipped with gcc
2016-02-01 14:35:51 -08:00
Benoit Steiner
c1a42c2d0d Don't disable the AVX implementations of plset when compiling with AVX512 enabled 2016-01-14 17:21:39 -08:00
Benoit Steiner
0366478df8 Added alignment requirement to the AVX512 packet traits. 2016-01-14 17:02:39 -08:00
Benoit Steiner
3cfd16f3af Fixed the signature of the plset primitives for AVX512 2016-01-14 16:58:01 -08:00
Benoit Steiner
67f44365ea Fixed the AVX512 signature of the ptranspose primitives 2016-01-14 16:51:11 -08:00
Benoit Steiner
a282eb1363 pscatter/pgather use Index instead of int to specify the stride 2016-01-14 16:39:39 -08:00
Benoit Steiner
7832485575 Deleted unnecessary commas and semicolons 2016-01-14 16:36:29 -08:00
Benoit Steiner
99093c0fe0 Added support for AVX512 to the build files 2016-01-05 10:02:49 -08:00
Benoit Steiner
9f9d8d2f62 Disabled part of the matrix matrix peeling code that's incompatible with 512 bit registers 2015-12-21 13:04:52 -08:00
Benoit Steiner
b74887d5f2 Implemented most of the packet primitives for AVX512 2015-12-21 11:46:36 -08:00
Benoit Steiner
6ffb208c77 Make sure EIGEN_HAS_MM_MALLOC is set to 1 when using the avx512 instruction set. 2015-12-21 11:23:15 -08:00
Benoit Steiner
994d1c60b9 Free memory allocated using posix_memalign() with free() instead of std::free() 2015-12-21 11:21:39 -08:00
Benoit Steiner
b8861b0c25 Make sure the data is aligned on a 64 byte boundary when using avx512 instructions. 2015-12-11 09:19:57 -08:00
Benoit Steiner
9a415fb1e2 Preliminary support for AVX512 2015-12-10 15:34:57 -08:00
330 changed files with 16034 additions and 4303 deletions

View File

@@ -13,7 +13,7 @@ core
core.*
*.bak
*~
build*
*build*
*.moc.*
*.moc
ui_*

View File

@@ -41,10 +41,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
# but won't stop CMake.
execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
# if we are not in a mercurial clone
if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
# if the mercurial program is absent or this will leave the EIGEN_HG_CHANGESET string empty,
# but won't stop CMake.
execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
endif()
# if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
if(EIGEN_BRANCH_OUTPUT MATCHES "default")
@@ -133,7 +136,6 @@ if(NOT MSVC)
if(COMPILER_SUPPORT_WERROR)
set(CMAKE_REQUIRED_FLAGS "-Werror")
endif()
ei_add_cxx_compiler_flag("-pedantic")
ei_add_cxx_compiler_flag("-Wall")
ei_add_cxx_compiler_flag("-Wextra")
@@ -231,6 +233,12 @@ if(NOT MSVC)
message(STATUS "Enabling FMA in tests/examples")
endif()
option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
if(EIGEN_TEST_AVX512)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512")
message(STATUS "Enabling AVX512 in tests/examples")
endif()
option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
if(EIGEN_TEST_F16C)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
@@ -375,7 +383,7 @@ else()
)
endif()
set(CMAKEPACKAGE_INSTALL_DIR
"${CMAKE_INSTALL_LIBDIR}/cmake/eigen3"
"${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
)
set(PKGCONFIG_INSTALL_DIR
@@ -411,16 +419,15 @@ add_subdirectory(Eigen)
add_subdirectory(doc EXCLUDE_FROM_ALL)
include(EigenConfigureTesting)
option(BUILD_TESTING "Enable creation of Eigen tests." ON)
if(BUILD_TESTING)
include(EigenConfigureTesting)
# fixme, not sure this line is still needed:
enable_testing() # must be called from the root CMakeLists, see man page
if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
else()
add_subdirectory(test EXCLUDE_FROM_ALL)
if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
else()
add_subdirectory(test EXCLUDE_FROM_ALL)
endif()
endif()
if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
@@ -431,6 +438,13 @@ else()
add_subdirectory(lapack EXCLUDE_FROM_ALL)
endif()
# add SYCL
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
if(EIGEN_TEST_SYCL)
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
include(FindComputeCpp)
endif()
add_subdirectory(unsupported)
add_subdirectory(demos EXCLUDE_FROM_ALL)
@@ -449,7 +463,9 @@ endif(NOT WIN32)
configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY)
ei_testing_print_summary()
if(BUILD_TESTING)
ei_testing_print_summary()
endif()
message(STATUS "")
message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
@@ -495,18 +511,89 @@ set ( EIGEN_VERSION_MINOR ${EIGEN_MAJOR_VERSION} )
set ( EIGEN_VERSION_PATCH ${EIGEN_MINOR_VERSION} )
set ( EIGEN_DEFINITIONS "")
set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
@ONLY ESCAPE_QUOTES
)
# Interface libraries require at least CMake 3.0
if (NOT CMAKE_VERSION VERSION_LESS 3.0)
include (CMakePackageConfigHelpers)
# Imported target support
add_library (eigen INTERFACE)
target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
target_include_directories (eigen INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
)
# Export as title case Eigen
set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
install (TARGETS eigen EXPORT Eigen3Targets)
configure_package_config_file (
${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
)
# Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
# not depend on architecture specific settings or libraries. More
# specifically, an Eigen3Config.cmake generated from a 64 bit target can be
# used for 32 bit targets as well (and vice versa).
set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
unset (CMAKE_SIZEOF_VOID_P)
write_basic_package_version_file (Eigen3ConfigVersion.cmake
VERSION ${EIGEN_VERSION_NUMBER}
COMPATIBILITY SameMajorVersion)
set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
# The Eigen target will be located in the Eigen3 namespace. Other CMake
# targets can refer to it using Eigen3::Eigen.
export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
# Export Eigen3 package to CMake registry such that it can be easily found by
# CMake even if it has not been installed to a standard directory.
export (PACKAGE Eigen3)
install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
else (NOT CMAKE_VERSION VERSION_LESS 3.0)
# Fallback to legacy Eigen3Config.cmake without the imported target
# If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
# create a relocatable Config file, otherwise leave the hardcoded paths
include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
if(CPCH_PATH)
configure_package_config_file (
${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
)
else()
# The PACKAGE_* variables are defined by the configure_package_config_file
# but without it we define them manually to the hardcoded paths
set(PACKAGE_INIT "")
set(PACKAGE_EIGEN_INCLUDE_DIR ${EIGEN_INCLUDE_DIR})
set(PACKAGE_EIGEN_ROOT_DIR ${EIGEN_ROOT_DIR})
configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
@ONLY ESCAPE_QUOTES )
endif()
write_basic_package_version_file( Eigen3ConfigVersion.cmake
VERSION ${EIGEN_VERSION_NUMBER}
COMPATIBILITY SameMajorVersion )
endif (NOT CMAKE_VERSION VERSION_LESS 3.0)
install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
)
${CMAKE_CURRENT_BINARY_DIR}/Eigen3ConfigVersion.cmake
DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} )
# Add uninstall target
add_custom_target ( uninstall

View File

@@ -4,14 +4,10 @@
## # The following are required to uses Dart and the Cdash dashboard
## ENABLE_TESTING()
## INCLUDE(CTest)
set(CTEST_PROJECT_NAME "Eigen")
set(CTEST_PROJECT_NAME "Eigen 3.3")
set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")
set(CTEST_DROP_METHOD "http")
set(CTEST_DROP_SITE "manao.inria.fr")
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen+3.3")
set(CTEST_DROP_SITE_CDASH TRUE)
set(CTEST_PROJECT_SUBPROJECTS
Official
Unsupported
)

View File

@@ -9,6 +9,7 @@
#define EIGEN_CHOLESKY_MODULE_H
#include "Core"
#include "Jacobi"
#include "src/Core/util/DisableStupidWarnings.h"
@@ -31,7 +32,11 @@
#include "src/Cholesky/LLT.h"
#include "src/Cholesky/LDLT.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/Cholesky/LLT_LAPACKE.h"
#endif

View File

@@ -14,9 +14,25 @@
// first thing Eigen does: stop the compiler from committing suicide
#include "src/Core/util/DisableStupidWarnings.h"
// Handle NVCC/CUDA
#ifdef __CUDACC__
// Do not try asserts on CUDA!
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
#define EIGEN_CUDACC __CUDACC__
#endif
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#elif defined(__CUDACC_VER__)
#define EIGEN_CUDACC_VER __CUDACC_VER__
#else
#define EIGEN_CUDACC_VER 0
#endif
// Handle NVCC/CUDA/SYCL
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
// Do not try asserts on CUDA and SYCL!
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
#endif
@@ -25,17 +41,24 @@
#undef EIGEN_INTERNAL_DEBUGGING
#endif
// Do not try to vectorize on CUDA!
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#ifdef EIGEN_EXCEPTIONS
#undef EIGEN_EXCEPTIONS
#endif
// All functions callable from CUDA code must be qualified with __device__
#define EIGEN_DEVICE_FUNC __host__ __device__
#ifdef __CUDACC__
// Do not try to vectorize on CUDA and SYCL!
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#define EIGEN_DEVICE_FUNC __host__ __device__
// We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
// works properly on the device side
#include <math_functions.hpp>
#else
#define EIGEN_DEVICE_FUNC
#endif
#else
#define EIGEN_DEVICE_FUNC
@@ -51,7 +74,7 @@
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS)
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
#define EIGEN_EXCEPTIONS
#endif
@@ -140,6 +163,18 @@
#ifdef __FMA__
#define EIGEN_VECTORIZE_FMA
#endif
#if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512)
#define EIGEN_VECTORIZE_AVX512
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_FMA
#ifdef __AVX512DQ__
#define EIGEN_VECTORIZE_AVX512DQ
#endif
#ifdef __AVX512ER__
#define EIGEN_VECTORIZE_AVX512ER
#endif
#endif
// include files
@@ -171,7 +206,7 @@
#ifdef EIGEN_VECTORIZE_SSE4_2
#include <nmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_AVX
#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
#include <immintrin.h>
#endif
#endif
@@ -213,7 +248,7 @@
#if defined __CUDACC__
#define EIGEN_VECTORIZE_CUDA
#include <vector_types.h>
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif
@@ -271,7 +306,9 @@
namespace Eigen {
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_AVX)
#if defined(EIGEN_VECTORIZE_AVX512)
return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_AVX)
return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_2)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
@@ -303,12 +340,16 @@ inline static const char *SimdInstructionSetsInUse(void) {
#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
#endif
namespace Eigen {
// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
// ensure QNX/QCC support
using std::size_t;
// gcc 4.6.0 wants std:: for ptrdiff_t
using std::ptrdiff_t;
}
/** \defgroup Core_Module Core module
* This is the main module of Eigen providing dense matrix and vector support
* (both fixed and dynamic size) with all the features corresponding to a BLAS library
@@ -330,8 +371,14 @@ using std::ptrdiff_t;
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
#include "src/Core/MathFunctionsImpl.h"
#include "src/Core/arch/Default/ConjHelper.h"
#if defined EIGEN_VECTORIZE_AVX
#if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX512/PacketMath.h"
#include "src/Core/arch/AVX512/MathFunctions.h"
#elif defined EIGEN_VECTORIZE_AVX
// Use AVX for floats and doubles, SSE for integers
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/Complex.h"
@@ -340,6 +387,7 @@ using std::ptrdiff_t;
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
@@ -359,7 +407,6 @@ using std::ptrdiff_t;
#include "src/Core/arch/ZVector/Complex.h"
#endif
#include "src/Core/arch/CUDA/Complex.h"
// Half float support
#include "src/Core/arch/CUDA/Half.h"
#include "src/Core/arch/CUDA/PacketMathHalf.h"
@@ -379,6 +426,11 @@ using std::ptrdiff_t;
#include "src/Core/functors/StlFunctors.h"
#include "src/Core/functors/AssignmentFunctors.h"
// Specialized functors to enable the processing of complex numbers
// on CUDA devices
#include "src/Core/arch/CUDA/Complex.h"
#include "src/Core/IO.h"
#include "src/Core/DenseCoeffsBase.h"
#include "src/Core/DenseBase.h"
#include "src/Core/MatrixBase.h"
@@ -426,7 +478,6 @@ using std::ptrdiff_t;
#include "src/Core/Redux.h"
#include "src/Core/Visitor.h"
#include "src/Core/Fuzzy.h"
#include "src/Core/IO.h"
#include "src/Core/Swap.h"
#include "src/Core/CommaInitializer.h"
#include "src/Core/GeneralProduct.h"

View File

@@ -45,7 +45,11 @@
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"

View File

@@ -28,7 +28,11 @@
#include "src/LU/FullPivLU.h"
#include "src/LU/PartialPivLU.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/LU/PartialPivLU_LAPACKE.h"
#endif
#include "src/LU/Determinant.h"

View File

@@ -36,7 +36,11 @@
#include "src/QR/ColPivHouseholderQR.h"
#include "src/QR/CompleteOrthogonalDecomposition.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/QR/HouseholderQR_LAPACKE.h"
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
#endif

View File

@@ -14,7 +14,7 @@
#include "src/Core/util/DisableStupidWarnings.h"
void *qMalloc(size_t size)
void *qMalloc(std::size_t size)
{
return Eigen::internal::aligned_malloc(size);
}
@@ -24,10 +24,10 @@ void qFree(void *ptr)
Eigen::internal::aligned_free(ptr);
}
void *qRealloc(void *ptr, size_t size)
void *qRealloc(void *ptr, std::size_t size)
{
void* newPtr = Eigen::internal::aligned_malloc(size);
memcpy(newPtr, ptr, size);
std::memcpy(newPtr, ptr, size);
Eigen::internal::aligned_free(ptr);
return newPtr;
}

View File

@@ -37,7 +37,11 @@
#include "src/SVD/JacobiSVD.h"
#include "src/SVD/BDCSVD.h"
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/SVD/JacobiSVD_LAPACKE.h"
#endif

View File

@@ -25,7 +25,9 @@
#include "SparseCore"
#include "OrderingMethods"
#ifndef EIGEN_MPL2_ONLY
#include "SparseCholesky"
#endif
#include "SparseLU"
#include "SparseQR"
#include "IterativeLinearSolvers"

View File

@@ -14,7 +14,7 @@
#include "Core"
#include <deque>
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
#define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)

View File

@@ -13,7 +13,7 @@
#include "Core"
#include <list>
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
#define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)

View File

@@ -14,7 +14,7 @@
#include "Core"
#include <vector>
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
#define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)

View File

@@ -248,7 +248,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was succesful,
* \c NumericalIssue if the matrix.appears to be negative.
* \c NumericalIssue if the factorization failed because of a zero pivot.
*/
ComputationInfo info() const
{
@@ -376,6 +376,8 @@ template<> struct ldlt_inplace<Lower>
if((rs>0) && pivot_is_valid)
A21 /= realAkk;
else if(rs>0)
ret = ret && (A21.array()==Scalar(0)).all();
if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
else if(!pivot_is_valid) found_zero_pivot = true;
@@ -568,13 +570,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
// more precisely, use pseudo-inverse of D (see bug 241)
using std::abs;
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
// In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
// as motivated by LAPACK's xGELSS:
// In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
// and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
// RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
// However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
// diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
// Using numeric_limits::min() gives us more robustness to denormals.
RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
for (Index i = 0; i < vecD.size(); ++i)
{

View File

@@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
*
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
* \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
* The other triangular part won't be read.
* The other triangular part won't be read.
*
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
* matrix A such that A = LL^* = U^*U, where L is lower triangular.
@@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out
*
* \b Performance: for best performance, it is recommended to use a column-major storage format
* with the Lower triangular part (the default), or, equivalently, a row-major storage format
* with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
* step, and rank-updates can be up to 3 times slower.
*
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
*
* Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
* Therefore, the strict lower part does not have to store correct values.
*
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
* Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
* the strict lower part does not have to store correct values.
*/
template<typename _MatrixType, int _UpLo> class LLT
{
public:
@@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
}
template<typename Derived>
void solveInPlace(MatrixBase<Derived> &bAndX) const;
void solveInPlace(const MatrixBase<Derived> &bAndX) const;
template<typename InputType>
LLT& compute(const EigenBase<InputType>& matrix);
@@ -177,7 +181,7 @@ template<typename _MatrixType, int _UpLo> class LLT
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was succesful,
* \c NumericalIssue if the matrix.appears to be negative.
* \c NumericalIssue if the matrix.appears not to be positive definite.
*/
ComputationInfo info() const
{
@@ -351,7 +355,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
Index ret;
if((ret=unblocked(A11))>=0) return k+ret;
if(rs>0) A11.adjoint().template triangularView<Upper>().template solveInPlace<OnTheRight>(A21);
if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,-1); // bottleneck
if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,typename NumTraits<RealScalar>::Literal(-1)); // bottleneck
}
return -1;
}
@@ -425,7 +429,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix.resize(size, size);
m_matrix = a.derived();
if (!internal::is_same_dense(m_matrix, a.derived()))
m_matrix = a.derived();
// Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0);
@@ -485,11 +490,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
*
* This version avoids a copy when the right hand side matrix b is not needed anymore.
*
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
* \sa LLT::solve(), MatrixBase::llt()
*/
template<typename MatrixType, int _UpLo>
template<typename Derived>
void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(m_matrix.rows()==bAndX.rows());

View File

@@ -14,34 +14,40 @@ namespace Eigen {
namespace internal {
template<typename Scalar, typename CholmodType>
void cholmod_configure_matrix(CholmodType& mat)
{
if (internal::is_same<Scalar,float>::value)
{
mat.xtype = CHOLMOD_REAL;
mat.dtype = CHOLMOD_SINGLE;
}
else if (internal::is_same<Scalar,double>::value)
{
template<typename Scalar> struct cholmod_configure_matrix;
template<> struct cholmod_configure_matrix<double> {
template<typename CholmodType>
static void run(CholmodType& mat) {
mat.xtype = CHOLMOD_REAL;
mat.dtype = CHOLMOD_DOUBLE;
}
else if (internal::is_same<Scalar,std::complex<float> >::value)
{
mat.xtype = CHOLMOD_COMPLEX;
mat.dtype = CHOLMOD_SINGLE;
}
else if (internal::is_same<Scalar,std::complex<double> >::value)
{
};
template<> struct cholmod_configure_matrix<std::complex<double> > {
template<typename CholmodType>
static void run(CholmodType& mat) {
mat.xtype = CHOLMOD_COMPLEX;
mat.dtype = CHOLMOD_DOUBLE;
}
else
{
eigen_assert(false && "Scalar type not supported by CHOLMOD");
}
}
};
// Other scalar types are not yet suppotred by Cholmod
// template<> struct cholmod_configure_matrix<float> {
// template<typename CholmodType>
// static void run(CholmodType& mat) {
// mat.xtype = CHOLMOD_REAL;
// mat.dtype = CHOLMOD_SINGLE;
// }
// };
//
// template<> struct cholmod_configure_matrix<std::complex<float> > {
// template<typename CholmodType>
// static void run(CholmodType& mat) {
// mat.xtype = CHOLMOD_COMPLEX;
// mat.dtype = CHOLMOD_SINGLE;
// }
// };
} // namespace internal
@@ -49,11 +55,11 @@ void cholmod_configure_matrix(CholmodType& mat)
* Note that the data are shared.
*/
template<typename _Scalar, int _Options, typename _StorageIndex>
cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> > mat)
{
cholmod_sparse res;
res.nzmax = mat.nonZeros();
res.nrow = mat.rows();;
res.nrow = mat.rows();
res.ncol = mat.cols();
res.p = mat.outerIndexPtr();
res.i = mat.innerIndexPtr();
@@ -88,7 +94,7 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
}
// setup res.xtype
internal::cholmod_configure_matrix<_Scalar>(res);
internal::cholmod_configure_matrix<_Scalar>::run(res);
res.stype = 0;
@@ -98,7 +104,14 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
template<typename _Scalar, int _Options, typename _Index>
const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>& mat)
{
cholmod_sparse res = viewAsCholmod(mat.const_cast_derived());
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
return res;
}
template<typename _Scalar, int _Options, typename _Index>
const cholmod_sparse viewAsCholmod(const SparseVector<_Scalar,_Options,_Index>& mat)
{
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.const_cast_derived()));
return res;
}
@@ -107,7 +120,7 @@ const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>&
template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
{
cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
if(UpLo==Upper) res.stype = 1;
if(UpLo==Lower) res.stype = -1;
@@ -131,7 +144,7 @@ cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat)
res.x = (void*)(mat.derived().data());
res.z = 0;
internal::cholmod_configure_matrix<Scalar>(res);
internal::cholmod_configure_matrix<Scalar>::run(res);
return res;
}
@@ -180,14 +193,16 @@ class CholmodBase : public SparseSolverBase<Derived>
CholmodBase()
: m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
{
m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
cholmod_start(&m_cholmod);
}
explicit CholmodBase(const MatrixType& matrix)
: m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
{
m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
cholmod_start(&m_cholmod);
compute(matrix);
}
@@ -254,7 +269,7 @@ class CholmodBase : public SparseSolverBase<Derived>
eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
// If the factorization failed, minor is the column at which it did. On success minor == n.
this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
m_factorizationIsOk = true;
@@ -290,8 +305,8 @@ class CholmodBase : public SparseSolverBase<Derived>
}
/** \internal */
template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
void _solve_impl(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
template<typename RhsDerived, typename DestDerived>
void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
{
eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
const Index size = m_cholmodFactor->n;
@@ -299,7 +314,8 @@ class CholmodBase : public SparseSolverBase<Derived>
eigen_assert(size==b.rows());
// note: cs stands for Cholmod Sparse
cholmod_sparse b_cs = viewAsCholmod(b);
Ref<SparseMatrix<typename RhsDerived::Scalar,ColMajor,typename RhsDerived::StorageIndex> > b_ref(b.const_cast_derived());
cholmod_sparse b_cs = viewAsCholmod(b_ref);
cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
if(!x_cs)
{
@@ -307,7 +323,7 @@ class CholmodBase : public SparseSolverBase<Derived>
return;
}
// TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
dest.derived() = viewAsEigen<typename DestDerived::Scalar,ColMajor,typename DestDerived::StorageIndex>(*x_cs);
cholmod_free_sparse(&x_cs, &m_cholmod);
}
#endif // EIGEN_PARSED_BY_DOXYGEN
@@ -324,7 +340,7 @@ class CholmodBase : public SparseSolverBase<Derived>
*/
Derived& setShift(const RealScalar& offset)
{
m_shiftOffset[0] = offset;
m_shiftOffset[0] = double(offset);
return derived();
}
@@ -386,7 +402,7 @@ class CholmodBase : public SparseSolverBase<Derived>
protected:
mutable cholmod_common m_cholmod;
cholmod_factor* m_cholmodFactor;
RealScalar m_shiftOffset[2];
double m_shiftOffset[2];
mutable ComputationInfo m_info;
int m_factorizationIsOk;
int m_analysisIsOk;
@@ -410,6 +426,8 @@ class CholmodBase : public SparseSolverBase<Derived>
*
* This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
*
* \warning Only double precision real and complex scalar types are supported by Cholmod.
*
* \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLLT
*/
template<typename _MatrixType, int _UpLo = Lower>
@@ -459,6 +477,8 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
*
* This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
*
* \warning Only double precision real and complex scalar types are supported by Cholmod.
*
* \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLDLT
*/
template<typename _MatrixType, int _UpLo = Lower>
@@ -506,6 +526,8 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
*
* This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
*
* \warning Only double precision real and complex scalar types are supported by Cholmod.
*
* \sa \ref TutorialSparseSolverConcept
*/
template<typename _MatrixType, int _UpLo = Lower>
@@ -555,6 +577,8 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
*
* This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
*
* \warning Only double precision real and complex scalar types are supported by Cholmod.
*
* \sa \ref TutorialSparseSolverConcept
*/
template<typename _MatrixType, int _UpLo = Lower>

View File

@@ -231,10 +231,16 @@ class Array
: Base(other)
{ }
private:
struct PrivateType {};
public:
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
PrivateType>::type = PrivateType())
: Base(other.derived())
{ }

View File

@@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());

View File

@@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
// Let's remove NestByRefBit
enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
Flags = Flags0 & ~NestByRefBit
LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
};
};
}
@@ -54,6 +55,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
using Base::coeffRef;
EIGEN_DEVICE_FUNC
explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
@@ -71,66 +74,18 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index rowId, Index colId) const
{
return m_expression.coeff(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
return m_expression.coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
return m_expression.coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index index) const
{
return m_expression.coeff(index);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
return m_expression.coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return m_expression.coeffRef(index);
}
template<int LoadMode>
inline const PacketScalar packet(Index rowId, Index colId) const
{
return m_expression.template packet<LoadMode>(rowId, colId);
}
template<int LoadMode>
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
{
m_expression.template writePacket<LoadMode>(rowId, colId, val);
}
template<int LoadMode>
inline const PacketScalar packet(Index index) const
{
return m_expression.template packet<LoadMode>(index);
}
template<int LoadMode>
inline void writePacket(Index index, const PacketScalar& val)
{
m_expression.template writePacket<LoadMode>(index, val);
}
template<typename Dest>
EIGEN_DEVICE_FUNC
inline void evalTo(Dest& dst) const { dst = m_expression; }
@@ -175,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
// Let's remove NestByRefBit
enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
Flags = Flags0 & ~NestByRefBit
LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
};
};
}
@@ -197,6 +153,8 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
using Base::coeffRef;
EIGEN_DEVICE_FUNC
explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
@@ -214,66 +172,18 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index rowId, Index colId) const
{
return m_expression.coeff(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
return m_expression.coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
return m_expression.derived().coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index index) const
{
return m_expression.coeff(index);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
return m_expression.coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return m_expression.coeffRef(index);
}
template<int LoadMode>
inline const PacketScalar packet(Index rowId, Index colId) const
{
return m_expression.template packet<LoadMode>(rowId, colId);
}
template<int LoadMode>
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
{
m_expression.template writePacket<LoadMode>(rowId, colId, val);
}
template<int LoadMode>
inline const PacketScalar packet(Index index) const
{
return m_expression.template packet<LoadMode>(index);
}
template<int LoadMode>
inline void writePacket(Index index, const PacketScalar& val)
{
m_expression.template writePacket<LoadMode>(index, val);
}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<NestedExpressionType>::type&
nestedExpression() const

View File

@@ -39,7 +39,7 @@ public:
enum {
DstAlignment = DstEvaluator::Alignment,
SrcAlignment = SrcEvaluator::Alignment,
DstHasDirectAccess = DstFlags & DirectAccessBit,
DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
};
@@ -83,7 +83,7 @@ private:
&& int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
&& (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
&& (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */
@@ -407,7 +407,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
: int(Kernel::AssignmentTraits::DstAlignment),
srcAlignment = Kernel::AssignmentTraits::JointAlignment
};
const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
@@ -515,7 +515,7 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
{
EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::Scalar Scalar;
typedef typename Kernel::PacketType PacketType;
@@ -527,7 +527,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
dstAlignment = alignable ? int(requestedAlignment)
: int(Kernel::AssignmentTraits::DstAlignment)
};
const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
const Scalar *dst_ptr = kernel.dstDataPtr();
if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
{
// the pointer is not aligend-on scalar, so alignment is not possible
@@ -554,7 +554,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
for(Index inner = alignedEnd; inner<innerSize ; ++inner)
kernel.assignCoeffByOuterInner(outer, inner);
alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
alignedStart = numext::mini((alignedStart+alignedStep)%packetSize, innerSize);
}
}
};
@@ -563,7 +563,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
{
EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
typedef typename Kernel::PacketType PacketType;
@@ -683,6 +683,11 @@ public:
: int(DstEvaluatorType::Flags)&RowMajorBit ? inner
: outer;
}
EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
{
return m_dstExpr.data();
}
protected:
DstEvaluatorType& m_dst;
@@ -696,16 +701,39 @@ protected:
* Part 5 : Entry point for dense rectangular assignment
***************************************************************************/
template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
template<typename DstXprType,typename SrcXprType, typename Functor>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
{
EIGEN_ONLY_USED_FOR_DEBUG(dst);
EIGEN_ONLY_USED_FOR_DEBUG(src);
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
}
template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
dst.resize(dstRows, dstCols);
eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
}
template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
{
typedef evaluator<DstXprType> DstEvaluatorType;
typedef evaluator<SrcXprType> SrcEvaluatorType;
DstEvaluatorType dstEvaluator(dst);
SrcEvaluatorType srcEvaluator(src);
// NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
// we need to resize the destination after the source evaluator has been created.
resize_if_allowed(dst, src, func);
DstEvaluatorType dstEvaluator(dst);
typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
@@ -714,7 +742,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstX
}
template<typename DstXprType, typename SrcXprType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
{
call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
}
@@ -796,11 +824,6 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
) && int(Dst::SizeAtCompileTime) != 1
};
Index dstRows = NeedToTranspose ? src.cols() : src.rows();
Index dstCols = NeedToTranspose ? src.rows() : src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
ActualDstType actualDst(dst);
@@ -823,15 +846,11 @@ template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
// TODO check whether this is the right place to perform these checks:
EIGEN_STATIC_ASSERT_LVALUE(Dst)
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
Assignment<Dst,Src,Func>::run(dst, src, func);
}
template<typename Dst, typename Src>
@@ -853,8 +872,6 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
#ifndef EIGEN_NO_DEBUG
internal::check_for_aliasing(dst, src);
#endif
@@ -873,9 +890,42 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
src.evalTo(dst);
}
// NOTE The following two functions are templated to avoid their instanciation if not needed
// This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
template<typename SrcScalarType>
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
src.addTo(dst);
}
template<typename SrcScalarType>
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
src.subTo(dst);
}
};
} // namespace internal

View File

@@ -84,7 +84,8 @@ class vml_assign_traits
struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
@@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \
const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \

View File

@@ -817,73 +817,79 @@ struct mapbase_evaluator : evaluator_base<Derived>
ColsAtCompileTime = XprType::ColsAtCompileTime,
CoeffReadCost = NumTraits<Scalar>::ReadCost
};
EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
: m_data(const_cast<PointerType>(map.data())),
m_xpr(map)
: m_data(const_cast<PointerType>(map.data())),
m_innerStride(map.innerStride()),
m_outerStride(map.outerStride())
{
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeff(Index row, Index col) const
{
return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
return m_data[col * colStride() + row * rowStride()];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeff(Index index) const
{
return m_data[index * m_xpr.innerStride()];
return m_data[index * m_innerStride.value()];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Scalar& coeffRef(Index row, Index col)
{
return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
return m_data[col * colStride() + row * rowStride()];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Scalar& coeffRef(Index index)
{
return m_data[index * m_xpr.innerStride()];
return m_data[index * m_innerStride.value()];
}
template<int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
PacketType packet(Index row, Index col) const
{
PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
PointerType ptr = m_data + row * rowStride() + col * colStride();
return internal::ploadt<PacketType, LoadMode>(ptr);
}
template<int LoadMode, typename PacketType>
EIGEN_STRONG_INLINE
PacketType packet(Index index) const
PacketType packet(Index index) const
{
return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
}
template<int StoreMode, typename PacketType>
EIGEN_STRONG_INLINE
void writePacket(Index row, Index col, const PacketType& x)
void writePacket(Index row, Index col, const PacketType& x)
{
PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
PointerType ptr = m_data + row * rowStride() + col * colStride();
return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
}
template<int StoreMode, typename PacketType>
EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
void writePacket(Index index, const PacketType& x)
{
internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
}
protected:
EIGEN_DEVICE_FUNC
inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
EIGEN_DEVICE_FUNC
inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
PointerType m_data;
const XprType& m_xpr;
const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
};
template<typename PlainObjectType, int MapOptions, typename StrideType>
@@ -971,7 +977,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
? int(outer_stride_at_compile_time<ArgType>::ret)
: int(inner_stride_at_compile_time<ArgType>::ret),
MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@@ -981,7 +987,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
PacketAlignment = unpacket_traits<PacketScalar>::alignment,
Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
&& (OuterStrideAtCompileTime!=0)
&& (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
};
typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
@@ -1012,14 +1020,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
: m_argImpl(block.nestedExpression()),
m_startRow(block.startRow()),
m_startCol(block.startCol())
m_startCol(block.startCol()),
m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
{ }
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
enum {
RowsAtCompileTime = XprType::RowsAtCompileTime
RowsAtCompileTime = XprType::RowsAtCompileTime,
ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1031,7 +1041,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeff(Index index) const
{
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
if (ForwardLinearAccess)
return m_argImpl.coeff(m_linear_offset.value() + index);
else
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1043,7 +1056,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Scalar& coeffRef(Index index)
{
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
if (ForwardLinearAccess)
return m_argImpl.coeffRef(m_linear_offset.value() + index);
else
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
}
template<int LoadMode, typename PacketType>
@@ -1057,8 +1073,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0);
if (ForwardLinearAccess)
return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
else
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0);
}
template<int StoreMode, typename PacketType>
@@ -1072,15 +1091,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0,
x);
if (ForwardLinearAccess)
return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
else
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0,
x);
}
protected:
evaluator<ArgType> m_argImpl;
const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
};
// TODO: This evaluator does not actually use the child evaluator;
@@ -1296,7 +1319,7 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
}
protected:
const ArgTypeNested m_arg;
typename internal::add_const_on_value_type<ArgTypeNested>::type m_arg;
const MemberOp m_functor;
};
@@ -1550,9 +1573,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
{ }
typedef typename XprType::Scalar Scalar;
// FIXME having to check whether ArgType is sparse here i not very nice.
typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
typedef typename XprType::CoeffReturnType CoeffReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeff(Index row, Index) const

View File

@@ -46,7 +46,7 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
enum {
Flags = _LhsNested::Flags & RowMajorBit
Flags = cwise_promote_storage_order<typename traits<Lhs>::StorageKind,typename traits<Rhs>::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value
};
};
} // end namespace internal
@@ -84,6 +84,7 @@ class CwiseBinaryOp :
{
public:
typedef typename internal::remove_all<BinaryOp>::type Functor;
typedef typename internal::remove_all<LhsType>::type Lhs;
typedef typename internal::remove_all<RhsType>::type Rhs;

View File

@@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
*/
template<typename Derived>
template<typename CustomNullaryOp>
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
{
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
*/
template<typename Derived>
template<typename CustomNullaryOp>
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
{
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
* \sa class CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(Index size, const Scalar& value)
{
return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@@ -208,49 +208,36 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
* \sa class CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(const Scalar& value)
{
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
return DenseBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_constant_op<Scalar>(value));
}
/**
* \brief Sets a linearly spaced vector.
/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
*
* The function generates 'size' equally spaced values in the closed interval [low,high].
* This particular version of LinSpaced() uses sequential access, i.e. vector access is
* assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
* and yields faster code than the random access version.
*
* When size is set to 1, a vector of length 1 containing 'high' is returned.
*
* \only_for_vectors
*
* Example: \include DenseBase_LinSpaced_seq.cpp
* Output: \verbinclude DenseBase_LinSpaced_seq.out
*
* \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Index,Scalar,Scalar), CwiseNullaryOp
* \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,size));
return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
}
/**
* \copydoc DenseBase::LinSpaced(Sequential_t, Index, const Scalar&, const Scalar&)
* Special version for fixed size types which does not require the size parameter.
/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
*
* \sa LinSpaced(Scalar,Scalar)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,Derived::SizeAtCompileTime));
return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
}
/**
@@ -264,14 +251,24 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
* Example: \include DenseBase_LinSpaced.cpp
* Output: \verbinclude DenseBase_LinSpaced.out
*
* \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Sequential_t,Index,const Scalar&,const Scalar&,Index), CwiseNullaryOp
* For integer scalar types, an even spacing is possible if and only if the length of the range,
* i.e., \c high-low is a scalar multiple of \c size-1, or if \c size is a scalar multiple of the
* number of values \c high-low+1 (meaning each value can be repeated the same number of time).
* If one of these two considions is not satisfied, then \c high is lowered to the largest value
* satisfying one of this constraint.
* Here are some examples:
*
* Example: \include DenseBase_LinSpacedInt.cpp
* Output: \verbinclude DenseBase_LinSpacedInt.out
*
* \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar,true>(low,high,size));
return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
}
/**
@@ -279,17 +276,17 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
* Special version for fixed size types which does not require the size parameter.
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar,true>(low,high,Derived::SizeAtCompileTime));
return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
}
/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
template<typename Derived>
bool DenseBase<Derived>::isApproxToConstant
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
(const Scalar& val, const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
@@ -304,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
*
* \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
template<typename Derived>
bool DenseBase<Derived>::isConstant
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
(const Scalar& val, const RealScalar& prec) const
{
return isApproxToConstant(val, prec);
@@ -315,7 +312,7 @@ bool DenseBase<Derived>::isConstant
* \sa setConstant(), Constant(), class CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
{
setConstant(val);
}
@@ -325,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
* \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
{
return derived() = Constant(rows(), cols(), val);
}
@@ -340,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
{
resize(size);
@@ -359,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
{
resize(rows, cols);
@@ -377,27 +374,33 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
* Example: \include DenseBase_setLinSpaced.cpp
* Output: \verbinclude DenseBase_setLinSpaced.out
*
* \sa CwiseNullaryOp
* For integer scalar types, do not miss the explanations on the definition
* of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
*
* \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,newSize));
return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
}
/**
* \brief Sets a linearly spaced vector.
*
* The function fills *this with equally spaced values in the closed interval [low,high].
* The function fills \c *this with equally spaced values in the closed interval [low,high].
* When size is set to 1, a vector of length 1 containing 'high' is returned.
*
* \only_for_vectors
*
* \sa setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
* For integer scalar types, do not miss the explanations on the definition
* of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
*
* \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return setLinSpaced(size(), low, high);
@@ -420,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
* \sa Zero(), Zero(Index)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero(Index rows, Index cols)
{
return Constant(rows, cols, Scalar(0));
@@ -443,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
* \sa Zero(), Zero(Index,Index)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero(Index size)
{
return Constant(size, Scalar(0));
@@ -460,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
* \sa Zero(Index), Zero(Index,Index)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero()
{
return Constant(Scalar(0));
@@ -475,7 +478,7 @@ DenseBase<Derived>::Zero()
* \sa class CwiseNullaryOp, Zero()
*/
template<typename Derived>
bool DenseBase<Derived>::isZero(const RealScalar& prec) const
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
@@ -493,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
* \sa class CwiseNullaryOp, Zero()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
{
return setConstant(Scalar(0));
}
@@ -508,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
* \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setZero(Index newSize)
{
resize(newSize);
@@ -526,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
* \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setZero(Index rows, Index cols)
{
resize(rows, cols);
@@ -550,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
* \sa Ones(), Ones(Index), isOnes(), class Ones
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones(Index rows, Index cols)
{
return Constant(rows, cols, Scalar(1));
@@ -573,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
* \sa Ones(), Ones(Index,Index), isOnes(), class Ones
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones(Index newSize)
{
return Constant(newSize, Scalar(1));
@@ -590,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
* \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones()
{
return Constant(Scalar(1));
@@ -605,7 +608,7 @@ DenseBase<Derived>::Ones()
* \sa class CwiseNullaryOp, Ones()
*/
template<typename Derived>
bool DenseBase<Derived>::isOnes
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
(const RealScalar& prec) const
{
return isApproxToConstant(Scalar(1), prec);
@@ -619,7 +622,7 @@ bool DenseBase<Derived>::isOnes
* \sa class CwiseNullaryOp, Ones()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
{
return setConstant(Scalar(1));
}
@@ -634,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
* \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setOnes(Index newSize)
{
resize(newSize);
@@ -652,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
* \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
{
resize(rows, cols);
@@ -676,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
* \sa Identity(), setIdentity(), isIdentity()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity(Index rows, Index cols)
{
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@@ -693,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
* \sa Identity(Index,Index), setIdentity(), isIdentity()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity()
{
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -752,7 +755,7 @@ struct setIdentity_impl<Derived, true>
static EIGEN_STRONG_INLINE Derived& run(Derived& m)
{
m.setZero();
const Index size = (std::min)(m.rows(), m.cols());
const Index size = numext::mini(m.rows(), m.cols());
for(Index i = 0; i < size; ++i) m.coeffRef(i,i) = typename Derived::Scalar(1);
return m;
}
@@ -768,7 +771,7 @@ struct setIdentity_impl<Derived, true>
* \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
{
return internal::setIdentity_impl<Derived>::run(derived());
}
@@ -784,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
{
derived().resize(rows, cols);
return setIdentity();
@@ -797,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
* \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@@ -812,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(),i);
@@ -825,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
{ return Derived::Unit(0); }
/** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@@ -835,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
{ return Derived::Unit(1); }
/** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@@ -845,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
{ return Derived::Unit(2); }
/** \returns an expression of the W axis unit vector (0,0,0,1)
@@ -855,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
{ return Derived::Unit(3); }
} // end namespace Eigen

View File

@@ -260,10 +260,10 @@ template<typename Derived> class DenseBase
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal Represents a matrix with all coefficients equal to one another*/
typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
/** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar,false>,PlainObject> SequentialLinSpacedReturnType;
/** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> SequentialLinSpacedReturnType;
/** \internal Represents a vector with linearly spaced coefficients that allows random access. */
typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar,true>,PlainObject> RandomAccessLinSpacedReturnType;
typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> RandomAccessLinSpacedReturnType;
/** \internal the return type of MatrixBase::eigenvalues() */
typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
@@ -296,7 +296,7 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& func);
/** \ínternal
/** \internal
* Copies \a other into *this without evaluating other. \returns a reference to *this.
* \deprecated */
template<typename OtherDerived>
@@ -463,7 +463,17 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
void visit(Visitor& func) const;
inline const WithFormat<Derived> format(const IOFormat& fmt) const;
/** \returns a WithFormat proxy object allowing to print a matrix the with given
* format \a fmt.
*
* See class IOFormat for some examples.
*
* \sa class IOFormat, class WithFormat
*/
inline const WithFormat<Derived> format(const IOFormat& fmt) const
{
return WithFormat<Derived>(derived(), fmt);
}
/** \returns the unique coefficient of a 1x1 expression */
EIGEN_DEVICE_FUNC
@@ -474,9 +484,9 @@ template<typename Derived> class DenseBase
return derived().coeff(0,0);
}
bool all() const;
bool any() const;
Index count() const;
EIGEN_DEVICE_FUNC bool all() const;
EIGEN_DEVICE_FUNC bool any() const;
EIGEN_DEVICE_FUNC Index count() const;
typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;

View File

@@ -624,7 +624,7 @@ struct first_aligned_impl<Alignment, Derived, false>
{
static inline Index run(const Derived& m)
{
return internal::first_aligned<Alignment>(&m.const_cast_derived().coeffRef(0,0), m.size());
return internal::first_aligned<Alignment>(m.data(), m.size());
}
};

View File

@@ -13,9 +13,9 @@
#define EIGEN_MATRIXSTORAGE_H
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
#else
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
#endif
namespace Eigen {
@@ -184,12 +184,16 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
{
internal::plain_array<T,Size,_Options> m_data;
public:
EIGEN_DEVICE_FUNC DenseStorage() {}
EIGEN_DEVICE_FUNC DenseStorage() {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
}
EIGEN_DEVICE_FUNC
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()) {}
EIGEN_DEVICE_FUNC
DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
}
EIGEN_DEVICE_FUNC
DenseStorage& operator=(const DenseStorage& other)
{
@@ -197,7 +201,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
EIGEN_UNUSED_VARIABLE(size);
EIGEN_UNUSED_VARIABLE(rows);
@@ -343,7 +347,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
}
EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
@@ -351,6 +355,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
, m_rows(other.m_rows)
, m_cols(other.m_cols)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -403,7 +408,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_rows = rows;
m_cols = cols;
@@ -422,7 +427,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
EIGEN_UNUSED_VARIABLE(rows);
}
@@ -430,6 +435,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
, m_cols(other.m_cols)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -477,7 +483,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_cols = cols;
}
@@ -495,7 +501,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
EIGEN_UNUSED_VARIABLE(cols);
}
@@ -503,6 +509,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
, m_rows(other.m_rows)
{
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -550,7 +557,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_rows = rows;
}

View File

@@ -21,7 +21,7 @@ namespace Eigen {
* \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
* \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
* A positive value means a superdiagonal, a negative value means a subdiagonal.
* You can also use Dynamic so the index can be set at runtime.
* You can also use DynamicIndex so the index can be set at runtime.
*
* The matrix is not required to be square.
*
@@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
EIGEN_DEVICE_FUNC
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
{
eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)

View File

@@ -290,12 +290,11 @@ MatrixBase<Derived>::asDiagonal() const
template<typename Derived>
bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
{
using std::abs;
if(cols() != rows()) return false;
RealScalar maxAbsOnDiagonal = static_cast<RealScalar>(-1);
for(Index j = 0; j < cols(); ++j)
{
RealScalar absOnDiagonal = abs(coeff(j,j));
RealScalar absOnDiagonal = numext::abs(coeff(j,j));
if(absOnDiagonal > maxAbsOnDiagonal) maxAbsOnDiagonal = absOnDiagonal;
}
for(Index j = 0; j < cols(); ++j)
@@ -321,6 +320,11 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
{
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
dst.setZero();
dst.diagonal() = src.diagonal();
}

View File

@@ -31,7 +31,8 @@ struct dot_nocheck
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.template binaryExpr<conj_prod>(b).sum();
}
@@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.transpose().template binaryExpr<conj_prod>(b).sum();
}
@@ -51,7 +53,8 @@ struct dot_nocheck<T, U, true>
} // end namespace internal
/** \returns the dot product of *this with other.
/** \fn MatrixBase::dot
* \returns the dot product of *this with other.
*
* \only_for_vectors
*
@@ -64,15 +67,18 @@ struct dot_nocheck<T, U, true>
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
#endif
eigen_assert(size() == other.size());
return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
@@ -99,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
* \sa lpNorm(), dot(), squaredNorm()
*/
template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
{
return numext::sqrt(squaredNorm());
}
@@ -114,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
* \sa norm(), normalize()
*/
template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::normalized() const
{
typedef typename internal::nested_eval<Derived,2>::type _Nested;
@@ -136,7 +142,7 @@ MatrixBase<Derived>::normalized() const
* \sa norm(), normalized()
*/
template<typename Derived>
inline void MatrixBase<Derived>::normalize()
EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
{
RealScalar z = squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -157,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
* \sa stableNorm(), stableNormalize(), normalized()
*/
template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::stableNormalized() const
{
typedef typename internal::nested_eval<Derived,3>::type _Nested;
@@ -182,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
* \sa stableNorm(), stableNormalized(), normalize()
*/
template<typename Derived>
inline void MatrixBase<Derived>::stableNormalize()
EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
{
RealScalar w = cwiseAbs().maxCoeff();
RealScalar z = (derived()/w).squaredNorm();

View File

@@ -14,6 +14,7 @@
namespace Eigen {
/** \class EigenBase
* \ingroup Core_Module
*
* Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
*
@@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived());
@@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());

View File

@@ -24,11 +24,17 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
template<int Size, int MaxSize> struct product_size_category
{
enum { is_large = MaxSize == Dynamic ||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD,
value = is_large ? Large
: Size == 1 ? 1
: Small
enum {
#ifndef EIGEN_CUDA_ARCH
is_large = MaxSize == Dynamic ||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
#else
is_large = 0,
#endif
value = is_large ? Large
: Size == 1 ? 1
: Small
};
};
@@ -223,50 +229,65 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
// on, the other hand it is good for the cache to pack the vector anyways...
EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
MightCannotUseDest = (ActualDest::InnerStrideAtCompileTime!=1) || ComplexByReal
MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal
};
gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
evalToDest ? dest.data() : static_dest.data());
if(!evalToDest)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
Index size = dest.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
if(!alphaIsCompatible)
{
MappedDest(actualDestPtr, dest.size()).setZero();
compatibleAlpha = RhsScalar(1);
}
else
MappedDest(actualDestPtr, dest.size()) = dest;
}
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
general_matrix_vector_product
<Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
RhsMapper(actualRhs.data(), actualRhs.innerStride()),
actualDestPtr, 1,
compatibleAlpha);
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
if (!evalToDest)
if(!MightCannotUseDest)
{
if(!alphaIsCompatible)
dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
else
dest = MappedDest(actualDestPtr, dest.size());
// shortcut if we are sure to be able to use dest directly,
// this ease the compiler to generate cleaner and more optimzized code for most common cases
general_matrix_vector_product
<Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
RhsMapper(actualRhs.data(), actualRhs.innerStride()),
dest.data(), 1,
compatibleAlpha);
}
else
{
gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
evalToDest ? dest.data() : static_dest.data());
if(!evalToDest)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
Index size = dest.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
if(!alphaIsCompatible)
{
MappedDest(actualDestPtr, dest.size()).setZero();
compatibleAlpha = RhsScalar(1);
}
else
MappedDest(actualDestPtr, dest.size()) = dest;
}
general_matrix_vector_product
<Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
RhsMapper(actualRhs.data(), actualRhs.innerStride()),
actualDestPtr, 1,
compatibleAlpha);
if (!evalToDest)
{
if(!alphaIsCompatible)
dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size());
else
dest = MappedDest(actualDestPtr, dest.size());
}
}
}
};
@@ -329,6 +350,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,false>
template<typename Lhs, typename Rhs, typename Dest>
static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
{
EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
// TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
typename nested_eval<Rhs,1>::type actual_rhs(rhs);
const Index size = rhs.rows();
@@ -342,6 +364,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
template<typename Lhs, typename Rhs, typename Dest>
static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
{
EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
typename nested_eval<Rhs,Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
const Index rows = dest.rows();
for(Index i=0; i<rows; ++i)
@@ -361,8 +384,6 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
*
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/
#ifndef __CUDACC__
template<typename Derived>
template<typename OtherDerived>
inline const Product<Derived, OtherDerived>
@@ -394,8 +415,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived());
}
#endif // __CUDACC__
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
*
* The returned product will behave like any other expressions: the coefficients of the product will be

View File

@@ -230,7 +230,7 @@ pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
* Currently, this function is only used for scalar * complex products.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet with elements of \a *from quadrupled.
@@ -278,7 +278,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
}
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Packet> inline Packet
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
plset(const typename unpacket_traits<Packet>::type& a) { return a; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@@ -329,7 +329,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
predux4(const Packet& a)
predux_downto4(const Packet& a)
{ return a; }
/** \internal \returns the product of the elements of \a a*/
@@ -482,7 +482,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
* by the current computation.
*/
template<typename Packet, int LoadMode>
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{
return ploadt<Packet, LoadMode>(from);
}
@@ -558,6 +558,34 @@ pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& th
return ifPacket.select[0] ? thenPacket : elsePacket;
}
/** \internal \returns \a a with the first coefficient replaced by the scalar b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b)
{
// Default implementation based on pblend.
// It must be specialized for higher performance.
Selector<unpacket_traits<Packet>::size> mask;
mask.select[0] = true;
// This for loop should be optimized away by the compiler.
for(Index i=1; i<unpacket_traits<Packet>::size; ++i)
mask.select[i] = false;
return pblend(mask, pset1<Packet>(b), a);
}
/** \internal \returns \a a with the last coefficient replaced by the scalar b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
{
// Default implementation based on pblend.
// It must be specialized for higher performance.
Selector<unpacket_traits<Packet>::size> mask;
// This for loop should be optimized away by the compiler.
for(Index i=0; i<unpacket_traits<Packet>::size-1; ++i)
mask.select[i] = false;
mask.select[unpacket_traits<Packet>::size-1] = true;
return pblend(mask, pset1<Packet>(b), a);
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -105,24 +105,10 @@ class WithFormat
}
protected:
const typename ExpressionType::Nested m_matrix;
typename ExpressionType::Nested m_matrix;
IOFormat m_format;
};
/** \returns a WithFormat proxy object allowing to print a matrix the with given
* format \a fmt.
*
* See class IOFormat for some examples.
*
* \sa class IOFormat, class WithFormat
*/
template<typename Derived>
inline const WithFormat<Derived>
DenseBase<Derived>::format(const IOFormat& fmt) const
{
return WithFormat<Derived>(derived(), fmt);
}
namespace internal {
// NOTE: This helper is kept for backward compatibility with previous code specializing

View File

@@ -45,6 +45,7 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::S
public:
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::PlainObject PlainObject;
typedef typename XprType::Scalar Scalar;
typedef typename internal::ref_selector<XprType>::type XprTypeNested;
typedef typename internal::remove_all<XprTypeNested>::type XprTypeNestedCleaned;
typedef typename internal::ref_selector<Inverse>::type Nested;

View File

@@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
{
typedef traits<PlainObjectType> TraitsBase;
enum {
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
? PlainObjectType::ColsAtCompileTime
: PlainObjectType::RowsAtCompileTime,
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime)
? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
? Dynamic
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
: int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
@@ -107,10 +113,11 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
EIGEN_DEVICE_FUNC
inline Index outerStride() const
{
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
: IsVectorAtCompileTime ? this->size()
: int(Flags)&RowMajorBit ? this->cols()
: this->rows();
return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer()
: int(internal::traits<Map>::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
: IsVectorAtCompileTime ? (this->size() * innerStride())
: (int(Flags)&RowMajorBit) ? (this->cols() * innerStride())
: (this->rows() * innerStride());
}
/** Constructor in the fixed-size case.

View File

@@ -97,6 +97,19 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
#ifdef __CUDA_ARCH__
template<typename T>
struct real_impl<std::complex<T> >
{
typedef T RealScalar;
EIGEN_DEVICE_FUNC
static inline T run(const std::complex<T>& x)
{
return x.real();
}
};
#endif
template<typename Scalar>
struct real_retval
{
@@ -132,6 +145,19 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
#ifdef __CUDA_ARCH__
template<typename T>
struct imag_impl<std::complex<T> >
{
typedef T RealScalar;
EIGEN_DEVICE_FUNC
static inline T run(const std::complex<T>& x)
{
return x.imag();
}
};
#endif
template<typename Scalar>
struct imag_retval
{
@@ -322,31 +348,7 @@ struct norm1_retval
* Implementation of hypot *
****************************************************************************/
template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
EIGEN_USING_STD_MATH(sqrt);
RealScalar _x = abs(x);
RealScalar _y = abs(y);
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
if(p==RealScalar(0)) return RealScalar(0);
return p * sqrt(RealScalar(1) + qp*qp);
}
};
template<typename Scalar> struct hypot_impl;
template<typename Scalar>
struct hypot_retval
@@ -469,7 +471,7 @@ namespace std_fallback {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x;
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
}
}
@@ -1035,11 +1037,24 @@ double log(const double &x) { return ::log(x); }
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename NumTraits<T>::Real abs(const T &x) {
typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
abs(const T &x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
abs(const T &x) {
return x;
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }

View File

@@ -71,6 +71,29 @@ T generic_fast_tanh_float(const T& a_x)
return pdiv(p, q);
}
template<typename RealScalar>
EIGEN_STRONG_INLINE
RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
{
EIGEN_USING_STD_MATH(sqrt);
RealScalar p, qp;
p = numext::maxi(x,y);
if(p==RealScalar(0)) return RealScalar(0);
qp = numext::mini(y,x) / p;
return p * sqrt(RealScalar(1) + qp*qp);
}
template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
return positive_real_hypot<RealScalar>(abs(x), abs(y));
}
};
} // end namespace internal
} // end namespace Eigen

View File

@@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other);
#ifdef __CUDACC__
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
operator*(const MatrixBase<OtherDerived> &other) const
{ return this->lazyProduct(other); }
#else
template<typename OtherDerived>
const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const;
#endif
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
@@ -294,7 +285,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox()
* \sa isApprox(), operator!= */
template<typename OtherDerived>
inline bool operator==(const MatrixBase<OtherDerived>& other) const
EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
{ return cwiseEqual(other).all(); }
/** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@@ -302,7 +293,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox()
* \sa isApprox(), operator== */
template<typename OtherDerived>
inline bool operator!=(const MatrixBase<OtherDerived>& other) const
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
{ return cwiseNotEqual(other).any(); }
NoAlias<Derived,Eigen::MatrixBase > noalias();
@@ -399,12 +390,14 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC
inline PlainObject unitOrthogonal(void) const;
EIGEN_DEVICE_FUNC
inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
// put this as separate enum value to work around possible GCC 4.3 bug (?)
enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
: ColsAtCompileTime==1 ? Vertical : Horizontal };
typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
EIGEN_DEVICE_FUNC
inline HomogeneousReturnType homogeneous() const;
enum {
@@ -414,7 +407,7 @@ template<typename Derived> class MatrixBase
internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne,Scalar,quotient) HNormalizedReturnType;
EIGEN_DEVICE_FUNC
inline const HNormalizedReturnType hnormalized() const;
////////// Householder module ///////////

View File

@@ -215,6 +215,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
EIGEN_DEVICE_FUNC
static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
static inline int digits10() { return NumTraits<Scalar>::digits10(); }
};
template<> struct NumTraits<std::string>

View File

@@ -41,7 +41,7 @@ template<> struct check_rows_cols_for_overflow<Dynamic> {
{
// http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
// we assume Index is signed
Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
bool error = (rows == 0 || cols == 0) ? false
: (rows > max_index / cols);
if (error)
@@ -58,6 +58,28 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
} // end namespace internal
#ifdef EIGEN_PARSED_BY_DOXYGEN
namespace doxygen {
// This is a workaround to doxygen not being able to understand the inheritance logic
// when it is hidden by the dense_xpr_base helper struct.
// Moreover, doxygen fails to include members that are not documented in the declaration body of
// MatrixBase if we inherits MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >,
// this is why we simply inherits MatrixBase, though this does not make sense.
/** This class is just a workaround for Doxygen and it does not not actually exist. */
template<typename Derived> struct dense_xpr_base_dispatcher;
/** This class is just a workaround for Doxygen and it does not not actually exist. */
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
: public MatrixBase {};
/** This class is just a workaround for Doxygen and it does not not actually exist. */
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
: public ArrayBase {};
} // namespace doxygen
/** \class PlainObjectBase
* \ingroup Core_Module
* \brief %Dense storage base class for matrices and arrays.
@@ -65,26 +87,10 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
* This class can be extended with the help of the plugin mechanism described on the page
* \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
*
* \tparam Derived is the derived type, e.g., a Matrix or Array
*
* \sa \ref TopicClassHierarchy
*/
#ifdef EIGEN_PARSED_BY_DOXYGEN
namespace doxygen {
// this is a workaround to doxygen not being able to understand the inheritance logic
// when it is hidden by the dense_xpr_base helper struct.
/** This class is just a workaround for Doxygen and it does not not actually exist. */
template<typename Derived> struct dense_xpr_base_dispatcher;
/** This class is just a workaround for Doxygen and it does not not actually exist. */
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
: public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
/** This class is just a workaround for Doxygen and it does not not actually exist. */
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
: public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
} // namespace doxygen
template<typename Derived>
class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
#else
@@ -554,7 +560,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
public:
/** \copydoc DenseBase::operator=(const EigenBase<OtherDerived>&)
/** \brief Copies the generic expression \a other into *this.
* \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
@@ -570,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
* \a data pointers.
*
* Here is an example using strides:
* \include Matrix_Map_stride.cpp
* Output: \verbinclude Matrix_Map_stride.out
*
* \see class Map
*/
//@{
@@ -763,6 +774,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
{
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
const bool is_integer = NumTraits<T>::IsInteger;
EIGEN_UNUSED_VARIABLE(is_integer);
EIGEN_STATIC_ASSERT(is_integer,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(size);
@@ -804,6 +816,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
this->_set_noalias(other);
}
// Initialize an arbitrary matrix from an object convertible to the Derived type.
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Derived& other){
this->_set_noalias(other);
}
// Initialize an arbitrary matrix from a generic Eigen expression
template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC
@@ -826,7 +845,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
this->derived() = r;
}
// For fixed -size arrays:
// For fixed-size Array<Scalar,...>
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
@@ -838,6 +857,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
Base::setConstant(val0);
}
// For fixed-size Array<Index,...>
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Index& val0,
@@ -916,8 +936,8 @@ struct conservative_resize_like_impl
{
// The storage order does not allow us to use reallocation.
typename Derived::PlainObject tmp(rows,cols);
const Index common_rows = (std::min)(rows, _this.rows());
const Index common_cols = (std::min)(cols, _this.cols());
const Index common_rows = numext::mini(rows, _this.rows());
const Index common_cols = numext::mini(cols, _this.cols());
tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
_this.derived().swap(tmp);
}
@@ -950,8 +970,8 @@ struct conservative_resize_like_impl
{
// The storage order does not allow us to use reallocation.
typename Derived::PlainObject tmp(other);
const Index common_rows = (std::min)(tmp.rows(), _this.rows());
const Index common_cols = (std::min)(tmp.cols(), _this.cols());
const Index common_rows = numext::mini(tmp.rows(), _this.rows());
const Index common_cols = numext::mini(tmp.cols(), _this.cols());
tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
_this.derived().swap(tmp);
}

View File

@@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
@@ -127,7 +127,7 @@ public:
using Base::derived;
typedef typename Base::Scalar Scalar;
operator const Scalar() const
EIGEN_STRONG_INLINE operator const Scalar() const
{
return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
}
@@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
public:
EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
return internal::evaluator<Derived>(derived()).coeff(row,col);
}
EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );

View File

@@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
typedef Product<Lhs, Rhs, Options> XprType;
typedef product_evaluator<XprType> Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
};
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
@@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const Product<Lhs, Rhs, DefaultProduct> > XprType;
typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
{}
};
@@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
xpr.index() ))
@@ -140,6 +140,10 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
static EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
// FIXME shall we handle nested_eval here?
generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
}
@@ -154,6 +158,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
static EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
// FIXME shall we handle nested_eval here?
generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
}
@@ -168,6 +173,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
static EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
// FIXME shall we handle nested_eval here?
generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
}
@@ -201,6 +207,12 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
static const bool value = true;
};
template<typename OtherXpr, typename Lhs, typename Rhs>
struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
static const bool value = true;
};
template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
struct assignment_from_xpr_op_product
{
@@ -234,19 +246,19 @@ template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
{
template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
}
template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
}
template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
};
@@ -265,7 +277,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
// FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dst.cols();
for (Index j=0; j<cols; ++j)
func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
}
// Row major result
@@ -278,7 +290,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
// FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dst.rows();
for (Index i=0; i<rows; ++i)
func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
}
template<typename Lhs, typename Rhs>
@@ -300,25 +312,25 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
};
template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
}
template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
}
template<typename Dst>
static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
}
template<typename Dst>
static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
}
@@ -354,17 +366,21 @@ template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
{
typedef typename nested_eval<Lhs,1>::type LhsNested;
typedef typename nested_eval<Rhs,1>::type RhsNested;
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
typedef typename internal::conditional<int(Side)==OnTheRight,Lhs,Rhs>::type MatrixType;
typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
template<typename Dest>
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
LhsNested actual_lhs(lhs);
RhsNested actual_rhs(rhs);
internal::gemv_dense_selector<Side,
(int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
>::run(lhs, rhs, dst, alpha);
>::run(actual_lhs, actual_rhs, dst, alpha);
}
};
@@ -437,6 +453,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
#if 0
std::cerr << "LhsOuterStrideBytes= " << LhsOuterStrideBytes << "\n";
std::cerr << "RhsOuterStrideBytes= " << RhsOuterStrideBytes << "\n";
std::cerr << "LhsAlignment= " << LhsAlignment << "\n";
std::cerr << "RhsAlignment= " << RhsAlignment << "\n";
std::cerr << "CanVectorizeLhs= " << CanVectorizeLhs << "\n";
std::cerr << "CanVectorizeRhs= " << CanVectorizeRhs << "\n";
std::cerr << "CanVectorizeInner= " << CanVectorizeInner << "\n";
std::cerr << "EvalToRowMajor= " << EvalToRowMajor << "\n";
std::cerr << "Alignment= " << Alignment << "\n";
std::cerr << "Flags= " << Flags << "\n";
#endif
}
// Everything below here is taken from CoeffBasedProduct.h
@@ -503,8 +531,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
: bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
: bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
: 0,
/* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
@@ -555,8 +583,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
}
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;
typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
LhsEtorType m_lhsImpl;
RhsEtorType m_rhsImpl;
@@ -590,7 +618,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
}
};
@@ -600,7 +628,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
res = pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
}
};
@@ -609,7 +637,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
}
};
@@ -618,7 +646,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
}
};
@@ -627,7 +655,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
{
res = pset1<Packet>(0);
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
};
@@ -636,7 +664,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
{
res = pset1<Packet>(0);
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
}
};
@@ -645,7 +673,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
res = pset1<Packet>(0);
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
for(Index i = 0; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
}
@@ -656,7 +684,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
res = pset1<Packet>(0);
res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
for(Index i = 0; i < innerDim; ++i)
res = pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
@@ -757,7 +785,11 @@ public:
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
Alignment = evaluator<MatrixType>::Alignment
Alignment = evaluator<MatrixType>::Alignment,
AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
};
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -769,7 +801,10 @@ public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
if(AsScalarProduct)
return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
else
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
}
protected:

View File

@@ -407,7 +407,7 @@ protected:
*/
template<typename Derived>
template<typename Func>
typename internal::traits<Derived>::Scalar
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::redux(const Func& func) const
{
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");

View File

@@ -95,6 +95,8 @@ protected:
template<typename Expression>
EIGEN_DEVICE_FUNC void construct(Expression& expr)
{
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
if(PlainObjectType::RowsAtCompileTime==1)
{
eigen_assert(expr.rows()==1 || expr.cols()==1);

View File

@@ -45,7 +45,7 @@ struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
};
}
// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
: public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
{
@@ -60,16 +60,20 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
/** \brief The type of coefficients in this matrix */
typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
typedef typename MatrixType::StorageIndex StorageIndex;
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
enum {
Mode = internal::traits<SelfAdjointView>::Mode,
Flags = internal::traits<SelfAdjointView>::Flags
Flags = internal::traits<SelfAdjointView>::Flags,
TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
};
typedef typename MatrixType::PlainObject PlainObject;
EIGEN_DEVICE_FUNC
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{}
{
EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
@@ -187,6 +191,36 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
}
typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC
inline const ConjugateReturnType conjugate() const
{ return ConjugateReturnType(m_matrix.conjugate()); }
typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
/** \sa MatrixBase::adjoint() const */
EIGEN_DEVICE_FUNC
inline const AdjointReturnType adjoint() const
{ return AdjointReturnType(m_matrix.adjoint()); }
typedef SelfAdjointView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
/** \sa MatrixBase::transpose() */
EIGEN_DEVICE_FUNC
inline TransposeReturnType transpose()
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
typename MatrixType::TransposeReturnType tmp(m_matrix);
return TransposeReturnType(tmp);
}
typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
/** \sa MatrixBase::transpose() const */
EIGEN_DEVICE_FUNC
inline const ConstTransposeReturnType transpose() const
{
return ConstTransposeReturnType(m_matrix.transpose());
}
/** \returns a const expression of the main diagonal of the matrix \c *this
*
* This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
@@ -287,6 +321,7 @@ public:
* Implementation of MatrixBase methods
***************************************************************************/
/** This is the const version of MatrixBase::selfadjointView() */
template<typename Derived>
template<unsigned int UpLo>
typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
@@ -295,6 +330,15 @@ MatrixBase<Derived>::selfadjointView() const
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
}
/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the current matrix
*
* The parameter \a UpLo can be either \c #Upper or \c #Lower
*
* Example: \include MatrixBase_selfadjointView.cpp
* Output: \verbinclude MatrixBase_selfadjointView.out
*
* \sa class SelfAdjointView
*/
template<typename Derived>
template<unsigned int UpLo>
typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type

View File

@@ -15,33 +15,29 @@ namespace Eigen {
// TODO generalize the scalar type of 'other'
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
return derived();
}

View File

@@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
template<typename Decomposition, typename RhsType>
struct solve_traits<Decomposition,RhsType,Dense>
{
typedef Matrix<typename RhsType::Scalar,
typedef typename make_proper_matrix_type<typename RhsType::Scalar,
Decomposition::ColsAtCompileTime,
RhsType::ColsAtCompileTime,
RhsType::PlainObject::Options,
Decomposition::MaxColsAtCompileTime,
RhsType::MaxColsAtCompileTime> PlainObject;
RhsType::MaxColsAtCompileTime>::type PlainObject;
};
template<typename Decomposition, typename RhsType>
@@ -139,7 +139,11 @@ struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar
typedef Solve<DecType,RhsType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
// FIXME shall we resize dst here?
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
src.dec()._solve_impl(src.rhs(), dst);
}
};
@@ -151,6 +155,11 @@ struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal:
typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
}
};
@@ -163,6 +172,11 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
}
};

View File

@@ -161,6 +161,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
* TriangularView methods
***************************************************************************/
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename MatrixType, unsigned int Mode>
template<int Side, typename OtherDerived>
void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
@@ -188,6 +189,7 @@ TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) co
{
return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
}
#endif
namespace internal {

View File

@@ -165,12 +165,13 @@ MatrixBase<Derived>::stableNorm() const
typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
DerivedCopy copy(derived());
const DerivedCopy copy(derived());
enum {
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
&& (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;

View File

@@ -78,6 +78,11 @@ template<typename MatrixType> class Transpose
typename internal::remove_reference<MatrixTypeNested>::type&
nestedExpression() { return m_matrix; }
/** \internal */
void resize(Index nrows, Index ncols) {
m_matrix.resize(ncols,nrows);
}
protected:
typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
};

View File

@@ -384,7 +384,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
const Product<OtherDerived, Transpose, AliasFreeProduct>
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
{
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
}
/** \returns the \a matrix with the inverse transpositions applied to the rows.

View File

@@ -470,6 +470,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
* \a Side==OnTheRight.
*
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
*
* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
* is an upper (resp. lower) triangular matrix.
@@ -495,6 +497,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
*
* See TriangularView:solve() for the details.
*/
template<int Side, typename OtherDerived>
@@ -539,13 +543,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
template<typename ProductType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha);
EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
};
/***************************************************************************
* Implementation of triangular evaluation/assignment
***************************************************************************/
#ifndef EIGEN_PARSED_BY_DOXYGEN
// FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
@@ -583,6 +588,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
eigen_assert(Mode == int(OtherDerived::Mode));
internal::call_assignment_no_alias(derived(), other.derived());
}
#endif
/***************************************************************************
* Implementation of TriangularBase methods
@@ -641,21 +647,20 @@ MatrixBase<Derived>::triangularView() const
template<typename Derived>
bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
{
using std::abs;
RealScalar maxAbsOnUpperPart = static_cast<RealScalar>(-1);
for(Index j = 0; j < cols(); ++j)
{
Index maxi = (std::min)(j, rows()-1);
Index maxi = numext::mini(j, rows()-1);
for(Index i = 0; i <= maxi; ++i)
{
RealScalar absValue = abs(coeff(i,j));
RealScalar absValue = numext::abs(coeff(i,j));
if(absValue > maxAbsOnUpperPart) maxAbsOnUpperPart = absValue;
}
}
RealScalar threshold = maxAbsOnUpperPart * prec;
for(Index j = 0; j < cols(); ++j)
for(Index i = j+1; i < rows(); ++i)
if(abs(coeff(i, j)) > threshold) return false;
if(numext::abs(coeff(i, j)) > threshold) return false;
return true;
}
@@ -667,20 +672,19 @@ bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
template<typename Derived>
bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
{
using std::abs;
RealScalar maxAbsOnLowerPart = static_cast<RealScalar>(-1);
for(Index j = 0; j < cols(); ++j)
for(Index i = j; i < rows(); ++i)
{
RealScalar absValue = abs(coeff(i,j));
RealScalar absValue = numext::abs(coeff(i,j));
if(absValue > maxAbsOnLowerPart) maxAbsOnLowerPart = absValue;
}
RealScalar threshold = maxAbsOnLowerPart * prec;
for(Index j = 1; j < cols(); ++j)
{
Index maxi = (std::min)(j, rows()-1);
Index maxi = numext::mini(j, rows()-1);
for(Index i = 0; i < maxi; ++i)
if(abs(coeff(i, j)) > threshold) return false;
if(numext::abs(coeff(i, j)) > threshold) return false;
}
return true;
}
@@ -777,15 +781,18 @@ public:
template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
typedef evaluator<DstXprType> DstEvaluatorType;
typedef evaluator<SrcXprType> SrcEvaluatorType;
DstEvaluatorType dstEvaluator(dst);
SrcEvaluatorType srcEvaluator(src);
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
DstEvaluatorType dstEvaluator(dst);
typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
@@ -802,7 +809,7 @@ void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& sr
template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src)
void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src)
{
call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
}
@@ -893,7 +900,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
{
for(Index j = 0; j < kernel.cols(); ++j)
{
Index maxi = (std::min)(j, kernel.rows());
Index maxi = numext::mini(j, kernel.rows());
Index i = 0;
if (((Mode&Lower) && SetOpposite) || (Mode&Upper))
{
@@ -938,8 +945,12 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
{
dst.setZero();
dst._assignProduct(src, 1);
Index dstRows = src.rows();
Index dstCols = src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
dst._assignProduct(src, 1, 0);
}
};
@@ -950,7 +961,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
{
dst._assignProduct(src, 1);
dst._assignProduct(src, 1, 1);
}
};
@@ -961,7 +972,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_ass
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
{
dst._assignProduct(src, -1);
dst._assignProduct(src, -1, 1);
}
};

View File

@@ -602,7 +602,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
return m_matrix / extendedTo(other.derived());
}
/** \returns an expression where each column of row of the referenced matrix are normalized.
/** \returns an expression where each column (or row) of the referenced matrix are normalized.
* The referenced matrix is \b not modified.
* \sa MatrixBase::normalized(), normalize()
*/
@@ -625,6 +625,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/////////// Geometry module ///////////
typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
EIGEN_DEVICE_FUNC
HomogeneousReturnType homogeneous() const;
typedef typename ExpressionType::PlainObject CrossReturnType;
@@ -654,6 +655,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
Direction==Horizontal ? HNormalized_SizeMinusOne : 1> >
HNormalizedReturnType;
EIGEN_DEVICE_FUNC
const HNormalizedReturnType hnormalized() const;
protected:

View File

@@ -194,7 +194,8 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
} // end namespace internal
/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
* \returns the minimum of all coefficients of *this and puts in *row and *col its location.
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
@@ -230,7 +231,8 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
return minVisitor.res;
}
/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
* \returns the maximum of all coefficients of *this and puts in *row and *col its location.
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()

View File

@@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
}
};
template<> struct conj_helper<Packet8f, Packet4cf, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
@@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
}
};
template<> struct conj_helper<Packet4d, Packet2cd, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
@@ -456,6 +424,26 @@ ptranspose(PacketBlock<Packet2cd,2>& kernel) {
kernel.packet[0].v = tmp;
}
template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
{
return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2));
}
template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
{
return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2));
}
template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
{
return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6)));
}
template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
{
return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2)));
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -355,30 +355,27 @@ pexp<Packet4d>(const Packet4d& _x) {
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. The main advantage of this approach is not just speed, but
// also the fact that it can be inlined and pipelined with other computations,
// further reducing its effective latency.
// exact solution. It does not handle +inf, or denormalized numbers correctly.
// The main advantage of this approach is not just speed, but also the fact that
// it can be inlined and pipelined with other computations, further reducing its
// effective latency. This is similar to Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psqrt<Packet8f>(const Packet8f& _x) {
_EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
Packet8f neg_half = pmul(_x, p8f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
Packet8f non_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_GE_OQ);
Packet8f x = _mm256_and_ps(non_zero_mask, _mm256_rsqrt_ps(_x));
Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
Packet8f denormal_mask = _mm256_and_ps(
_mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
_CMP_LT_OQ),
_mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
// Compute approximate reciprocal sqrt.
Packet8f x = _mm256_rsqrt_ps(_x);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
// Multiply the original _x by it's reciprocal square root to extract the
// square root.
return pmul(_x, x);
x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
// Flush results for denormals to zero.
return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
}
#else
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED

View File

@@ -48,7 +48,9 @@ template<> struct is_arithmetic<__m256d> { enum { value = true }; };
#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
const Packet8i p8i_##NAME = pset1<Packet8i>(X)
// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
// to leverage AVX512 instructions.
#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet8f type;
@@ -93,6 +95,7 @@ template<> struct packet_traits<double> : default_packet_traits
HasCeil = 1
};
};
#endif
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
@@ -304,9 +307,11 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
return _mm_cvtss_f32(_mm256_castps256_ps128(a));
@@ -328,9 +333,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
__m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
#if 0
// This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
// exhibit the same latency/throughput, but it is here for future reference/benchmarking...
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
return _mm256_permute_pd(swap_halves,5);
#endif
}
// pabs should be ok
@@ -390,17 +398,14 @@ template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
tmp0 = _mm256_hadd_ps(tmp0,tmp0);
return pfirst(_mm256_hadd_ps(tmp0, tmp0));
return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_hadd_pd(tmp0,tmp0));
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}
template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
@@ -604,6 +609,26 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons
return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
}
template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
{
return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
}
template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
{
return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
}
template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
{
return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));
}
template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
{
return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -0,0 +1,391 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
namespace Eigen {
namespace internal {
// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
#if EIGEN_GNUC_AT_LEAST(5, 3)
#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
const Packet16f p16f_##NAME = pset1<Packet16f>(X)
#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
const Packet8d p8d_##NAME = pset1<Packet8d>(X)
#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
// Natural logarithm
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
// be easily approximated by a polynomial centered on m=1 for stability.
#if defined(EIGEN_VECTORIZE_AVX512DQ)
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
plog<Packet16f>(const Packet16f& _x) {
Packet16f x = _x;
_EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
_EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
// The smallest non denormalized float number.
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// Polynomial coefficients.
_EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
// invalid_mask is set to true when x is NaN
__mmask16 invalid_mask =
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
__mmask16 iszero_mask =
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
// Truncate input values to the minimum positive normal.
x = pmax(x, p16f_min_norm_pos);
// Extract the shifted exponents.
Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
// Set the exponents to -1, i.e. x are in the range [0.5,1).
x = _mm512_and_ps(x, p16f_inv_mant_mask);
x = _mm512_or_ps(x, p16f_half);
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
// and shift by -1. The values are then centered around 0, which improves
// the stability of the polynomial evaluation.
// if( x < SQRTHF ) {
// e -= 1;
// x = x + x - 1.0;
// } else { x = x - 1.0; }
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
x = psub(x, p16f_1);
e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
x = padd(x, tmp);
Packet16f x2 = pmul(x, x);
Packet16f x3 = pmul(x2, x);
// Evaluate the polynomial approximant of degree 8 in three parts, probably
// to improve instruction-level parallelism.
Packet16f y, y1, y2;
y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
y = pmadd(y, x, p16f_cephes_log_p2);
y1 = pmadd(y1, x, p16f_cephes_log_p5);
y2 = pmadd(y2, x, p16f_cephes_log_p8);
y = pmadd(y, x3, y1);
y = pmadd(y, x3, y2);
y = pmul(y, x3);
// Add the logarithm of the exponent back to the result of the interpolation.
y1 = pmul(e, p16f_cephes_log_q1);
tmp = pmul(x2, p16f_half);
y = padd(y, y1);
x = psub(x, tmp);
y2 = pmul(e, p16f_cephes_log_q2);
x = padd(x, y);
x = padd(x, y2);
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
return _mm512_mask_blend_ps(iszero_mask,
_mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
p16f_minus_inf);
}
#endif
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
pexp<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
_EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
_EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
_EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
// Clamp x.
Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
// Express exp(x) as exp(m*ln(2) + r), start by extracting
// m = floor(x/ln(2) + 0.5).
Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
// Get r = x - m*ln(2). Note that we can do this without losing more than one
// ulp precision due to the FMA instruction.
_EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
Packet16f r2 = pmul(r, r);
// TODO(gonnet): Split into odd/even polynomials and try to exploit
// instruction-level parallelism.
Packet16f y = p16f_cephes_exp_p0;
y = pmadd(y, r, p16f_cephes_exp_p1);
y = pmadd(y, r, p16f_cephes_exp_p2);
y = pmadd(y, r, p16f_cephes_exp_p3);
y = pmadd(y, r, p16f_cephes_exp_p4);
y = pmadd(y, r, p16f_cephes_exp_p5);
y = pmadd(y, r2, r);
y = padd(y, p16f_1);
// Build emm0 = 2^m.
Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
emm0 = _mm512_slli_epi32(emm0, 23);
// Return 2^m * exp(r).
return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
}
/*template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
pexp<Packet8d>(const Packet8d& _x) {
Packet8d x = _x;
_EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
_EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
_EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
_EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
_EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
// clamp x
x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
// Express exp(x) as exp(g + n*log(2)).
const Packet8d n =
_mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
// digits right.
const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
x = psub(x, nC1);
x = psub(x, nC2);
const Packet8d x2 = pmul(x, x);
// Evaluate the numerator polynomial of the rational interpolant.
Packet8d px = p8d_cephes_exp_p0;
px = pmadd(px, x2, p8d_cephes_exp_p1);
px = pmadd(px, x2, p8d_cephes_exp_p2);
px = pmul(px, x);
// Evaluate the denominator polynomial of the rational interpolant.
Packet8d qx = p8d_cephes_exp_q0;
qx = pmadd(qx, x2, p8d_cephes_exp_q1);
qx = pmadd(qx, x2, p8d_cephes_exp_q2);
qx = pmadd(qx, x2, p8d_cephes_exp_q3);
// I don't really get this bit, copied from the SSE2 routines, so...
// TODO(gonnet): Figure out what is going on here, perhaps find a better
// rational interpolant?
x = _mm512_div_pd(px, psub(qx, px));
x = pmadd(p8d_2, x, p8d_1);
// Build e=2^n.
const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
_mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
return pmax(pmul(x, e), _x);
}*/
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. The main advantage of this approach is not just speed, but
// also the fact that it can be inlined and pipelined with other computations,
// further reducing its effective latency.
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Multiply the original _x by it's reciprocal square root to extract the
// square root.
return pmul(_x, x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
Packet8d neg_half = pmul(_x, p8d_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Multiply the original _x by it's reciprocal square root to extract the
// square root.
return pmul(_x, x);
}
#else
template <>
EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
return _mm512_sqrt_ps(x);
}
template <>
EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
return _mm512_sqrt_pd(x);
}
#endif
// Functions for rsqrt.
// Almost identical to the sqrt routine, just leave out the last multiplication
// and fill in NaN/Inf where needed. Note that this function only exists as an
// iterative version for doubles since there is no instruction for diretly
// computing the reciprocal square root in AVX-512.
#ifdef EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
prsqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps(
neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
prsqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
Packet8d neg_half = pmul(_x, p8d_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd(
neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
}
#elif defined(EIGEN_VECTORIZE_AVX512ER)
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x);
}
#endif
#endif
} // end namespace internal
} // end namespace Eigen
#endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_

File diff suppressed because it is too large Load Diff

View File

@@ -15,14 +15,14 @@ namespace Eigen {
namespace internal {
static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
#ifdef __VSX__
#if defined(_BIG_ENDIAN)
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
#else
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
#endif
#endif
@@ -65,7 +65,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
Packet2cf res;
if((ptrdiff_t(&from) % 16) == 0)
if((std::ptrdiff_t(&from) % 16) == 0)
res.v = pload<Packet4f>((const float *)&from);
else
res.v = ploadu<Packet4f>((const float *)&from);
@@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
@@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
{ return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{

View File

@@ -84,8 +84,10 @@ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
#ifdef __POWER8_VECTOR__
static Packet2l p2l_1023 = { 1023, 1023 };
static Packet2ul p2ul_52 = { 52, 52 };
#endif
#endif

View File

@@ -72,7 +72,7 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
#ifndef __VSX__
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
#endif
@@ -90,7 +90,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
#endif
#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
// Handle endianness properly while loading constants
// Define global static constants:
@@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -358,7 +358,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_ZERO); }
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
@@ -373,7 +373,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
t = vec_nmsub(y_0, b, p4f_ONE);
y_1 = vec_madd(y_0, t, y_0);
return vec_madd(a, y_1, p4f_ZERO);
return vec_madd(a, y_1, p4f_MZERO);
#else
return vec_div(a, b);
#endif
@@ -388,10 +388,28 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
Packet4f ret;
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
#else
return vec_min(a, b);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
Packet4f ret;
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
#else
return vec_max(a, b);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@@ -450,15 +468,15 @@ template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
Packet4f p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
else p = ploadu<Packet4f>(from);
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
else p = ploadu<Packet4f>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
Packet4i p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
else p = ploadu<Packet4i>(from);
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
else p = ploadu<Packet4i>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
@@ -764,9 +782,9 @@ typedef __vector __bool long Packet2bl;
static Packet2l p2l_ONE = { 1, 1 };
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
static Packet2d p2d_MZERO = { -0.0, -0.0 };
#ifdef _BIG_ENDIAN
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
@@ -904,15 +922,25 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
// for some weird raisons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
{
Packet2d ret;
__asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
}
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
{
Packet2d ret;
__asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
}
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
@@ -935,8 +963,8 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
Packet2d p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
else p = ploadu<Packet2d>(from);
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
else p = ploadu<Packet2d>(from);
return vec_splat_dbl<0>(p);
}
@@ -969,7 +997,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
Packet2d v[2], sum;
v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
#ifdef _BIG_ENDIAN
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
#else
@@ -1022,7 +1050,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
return vec_sel(elsePacket, thenPacket, mask);
}
#endif // __VSX__

View File

@@ -24,34 +24,43 @@ namespace internal {
// compile. Here, we manually specialize these functors for complex types when
// building for CUDA to avoid non-constexpr methods.
template<typename T> struct scalar_sum_op<std::complex<T>> {
// Sum
template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
return std::complex<T>(numext::real(a) + numext::real(b),
numext::imag(a) + numext::imag(b));
}
};
template<typename T> struct scalar_difference_op<std::complex<T>> {
template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
// Difference
template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
return std::complex<T>(numext::real(a) - numext::real(b),
numext::imag(a) - numext::imag(b));
}
};
template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T>> {
template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
// Product
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
Vectorizable = packet_traits<std::complex<T>>::HasMul
};
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
@@ -61,14 +70,18 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T>>
}
};
template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T>> {
template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
// Quotient
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
Vectorizable = packet_traits<std::complex<T>>::HasDiv
};
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
@@ -79,6 +92,8 @@ template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T>>
}
};
template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
#endif
} // end namespace internal

View File

@@ -13,7 +13,7 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@@ -147,55 +147,55 @@ namespace half_impl {
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
__device__ half operator + (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
return __hadd(a, b);
}
__device__ half operator * (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b);
}
__device__ half operator - (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b);
}
__device__ half operator / (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
}
__device__ half operator - (const half& a) {
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a);
}
__device__ half& operator += (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
a = a + b;
return a;
}
__device__ half& operator *= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
a = a * b;
return a;
}
__device__ half& operator -= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
a = a - b;
return a;
}
__device__ half& operator /= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
a = a / b;
return a;
}
__device__ bool operator == (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
return __heq(a, b);
}
__device__ bool operator != (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
return __hne(a, b);
}
__device__ bool operator < (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
return __hlt(a, b);
}
__device__ bool operator <= (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
return __hle(a, b);
}
__device__ bool operator > (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
return __hgt(a, b);
}
__device__ bool operator >= (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b);
}
@@ -238,10 +238,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
return float(a) == float(b);
return numext::equal_strict(float(a),float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
return float(a) != float(b);
return numext::not_equal_strict(float(a), float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
return float(a) < float(b);
@@ -386,11 +386,15 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
return half(::expf(float(a)));
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
return half(hexp(a));
#else
return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return Eigen::half(::hlog(a));
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return half(::hlog(a));
#else
return half(::logf(float(a)));
#endif
@@ -402,7 +406,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
return half(::sqrtf(float(a)));
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
return half(::powf(float(a), float(b)));
@@ -420,10 +428,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
return half(hceil(a));
#else
return half(::ceilf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
@@ -474,9 +490,59 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
} // end namespace internal
} // end namespace Eigen
namespace std {
template<>
struct numeric_limits<Eigen::half> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
static Eigen::half round_error() { return Eigen::half(0.5); }
static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};
}
namespace Eigen {
template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half>
{
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
@@ -507,7 +573,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return Eigen::half(::hlog(a));
#else
return Eigen::half(::logf(float(a)));

View File

@@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
double tmp = kernel.packet[0].y;
float tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;

View File

@@ -15,7 +15,7 @@ namespace Eigen {
namespace internal {
// Most of the following operations require arch >= 3.0
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
template<> struct is_arithmetic<half2> { enum { value = true }; };
@@ -41,15 +41,15 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
return __half2half2(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
return *reinterpret_cast<const half2*>(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[1]);
}
@@ -57,17 +57,17 @@ template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[0]);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
*reinterpret_cast<half2*>(to) = from;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
to[0] = __low2half(from);
to[1] = __high2half(from);
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
return __ldg((const half2*)from);
#else
@@ -76,7 +76,7 @@ template<>
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
@@ -84,27 +84,27 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Ei
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
return __halves2half2(from[0*stride], from[1*stride]);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
to[stride*0] = __low2half(from);
to[stride*1] = __high2half(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
return __low2half(a);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result;
result.x = a.x & 0x7FFF7FFF;
return result;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
__device__ EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
__half a1 = __low2half(kernel.packet[0]);
__half a2 = __high2half(kernel.packet[0]);
@@ -114,7 +114,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
kernel.packet[1] = __halves2half2(a2, b2);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
#if __CUDA_ARCH__ >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
@@ -123,7 +123,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
return __hadd2(a, b);
#else
@@ -137,7 +137,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2&
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
return __hsub2(a, b);
#else
@@ -151,7 +151,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2&
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if __CUDA_ARCH__ >= 530
return __hneg2(a);
#else
@@ -161,9 +161,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
return __hmul2(a, b);
#else
@@ -177,7 +177,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2&
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if __CUDA_ARCH__ >= 530
return __hfma2(a, b, c);
#else
@@ -193,7 +193,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2&
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -203,7 +203,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2&
return __floats2half2_rn(r1, r2);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -213,7 +213,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2&
return __halves2half2(r1, r2);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -223,7 +223,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
return __halves2half2(r1, r2);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
return __hadd(__low2half(a), __high2half(a));
#else
@@ -233,7 +233,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
__half first = __low2half(a);
__half second = __high2half(a);
@@ -245,7 +245,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(c
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
__half first = __low2half(a);
__half second = __high2half(a);
@@ -257,7 +257,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(c
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
return __hmul(__low2half(a), __high2half(a));
#else
@@ -267,7 +267,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(c
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = log1pf(a1);
@@ -275,31 +275,31 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2
return __floats2half2_rn(r1, r2);
}
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
template<> __device__ EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {
return h2log(a);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
template<> __device__ EIGEN_STRONG_INLINE
half2 pexp<half2>(const half2& a) {
return h2exp(a);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
template<> __device__ EIGEN_STRONG_INLINE
half2 psqrt<half2>(const half2& a) {
return h2sqrt(a);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
template<> __device__ EIGEN_STRONG_INLINE
half2 prsqrt<half2>(const half2& a) {
return h2rsqrt(a);
}
#else
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = logf(a1);
@@ -307,7 +307,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2&
return __floats2half2_rn(r1, r2);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expf(a1);
@@ -315,7 +315,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2&
return __floats2half2_rn(r1, r2);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = sqrtf(a1);
@@ -323,7 +323,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2&
return __floats2half2_rn(r1, r2);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = rsqrtf(a1);
@@ -333,6 +333,374 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2
#endif
#elif defined EIGEN_VECTORIZE_AVX512
typedef struct {
__m256i x;
} Packet16h;
template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
template <>
struct packet_traits<half> : default_packet_traits {
typedef Packet16h type;
// There is no half-size packet for Packet16h.
typedef Packet16h half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 0,
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
HasDiv = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
HasLog = 0,
HasBlend = 0
};
};
template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
Packet16h result;
result.x = _mm256_set1_epi16(from.x);
return result;
}
template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
}
template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
Packet16h result;
result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
return result;
}
template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
Packet16h result;
result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
return result;
}
template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
_mm256_store_si256((__m256i*)to, from.x);
}
template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
_mm256_storeu_si256((__m256i*)to, from.x);
}
template<> EIGEN_STRONG_INLINE Packet16h
ploadquad(const Eigen::half* from) {
Packet16h result;
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
return result;
}
EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
#ifdef EIGEN_HAS_FP16_C
return _mm512_cvtph_ps(a.x);
#else
EIGEN_ALIGN64 half aux[16];
pstore(aux, a);
float f0(aux[0]);
float f1(aux[1]);
float f2(aux[2]);
float f3(aux[3]);
float f4(aux[4]);
float f5(aux[5]);
float f6(aux[6]);
float f7(aux[7]);
float f8(aux[8]);
float f9(aux[9]);
float fa(aux[10]);
float fb(aux[11]);
float fc(aux[12]);
float fd(aux[13]);
float fe(aux[14]);
float ff(aux[15]);
return _mm512_set_ps(
ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
#endif
}
EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#ifdef EIGEN_HAS_FP16_C
Packet16h result;
result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
return result;
#else
EIGEN_ALIGN64 float aux[16];
pstore(aux, a);
half h0(aux[0]);
half h1(aux[1]);
half h2(aux[2]);
half h3(aux[3]);
half h4(aux[4]);
half h5(aux[5]);
half h6(aux[6]);
half h7(aux[7]);
half h8(aux[8]);
half h9(aux[9]);
half ha(aux[10]);
half hb(aux[11]);
half hc(aux[12]);
half hd(aux[13]);
half he(aux[14]);
half hf(aux[15]);
Packet16h result;
result.x = _mm256_set_epi16(
hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
return result;
#endif
}
template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = padd(af, bf);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = pmul(af, bf);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux(from_float));
}
template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
{
Packet16h result;
result.x = _mm256_set_epi16(
from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
return result;
}
template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
{
EIGEN_ALIGN64 half aux[16];
pstore(aux, from);
to[stride*0].x = aux[0].x;
to[stride*1].x = aux[1].x;
to[stride*2].x = aux[2].x;
to[stride*3].x = aux[3].x;
to[stride*4].x = aux[4].x;
to[stride*5].x = aux[5].x;
to[stride*6].x = aux[6].x;
to[stride*7].x = aux[7].x;
to[stride*8].x = aux[8].x;
to[stride*9].x = aux[9].x;
to[stride*10].x = aux[10].x;
to[stride*11].x = aux[11].x;
to[stride*12].x = aux[12].x;
to[stride*13].x = aux[13].x;
to[stride*14].x = aux[14].x;
to[stride*15].x = aux[15].x;
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,16>& kernel) {
__m256i a = kernel.packet[0].x;
__m256i b = kernel.packet[1].x;
__m256i c = kernel.packet[2].x;
__m256i d = kernel.packet[3].x;
__m256i e = kernel.packet[4].x;
__m256i f = kernel.packet[5].x;
__m256i g = kernel.packet[6].x;
__m256i h = kernel.packet[7].x;
__m256i i = kernel.packet[8].x;
__m256i j = kernel.packet[9].x;
__m256i k = kernel.packet[10].x;
__m256i l = kernel.packet[11].x;
__m256i m = kernel.packet[12].x;
__m256i n = kernel.packet[13].x;
__m256i o = kernel.packet[14].x;
__m256i p = kernel.packet[15].x;
__m256i ab_07 = _mm256_unpacklo_epi16(a, b);
__m256i cd_07 = _mm256_unpacklo_epi16(c, d);
__m256i ef_07 = _mm256_unpacklo_epi16(e, f);
__m256i gh_07 = _mm256_unpacklo_epi16(g, h);
__m256i ij_07 = _mm256_unpacklo_epi16(i, j);
__m256i kl_07 = _mm256_unpacklo_epi16(k, l);
__m256i mn_07 = _mm256_unpacklo_epi16(m, n);
__m256i op_07 = _mm256_unpacklo_epi16(o, p);
__m256i ab_8f = _mm256_unpackhi_epi16(a, b);
__m256i cd_8f = _mm256_unpackhi_epi16(c, d);
__m256i ef_8f = _mm256_unpackhi_epi16(e, f);
__m256i gh_8f = _mm256_unpackhi_epi16(g, h);
__m256i ij_8f = _mm256_unpackhi_epi16(i, j);
__m256i kl_8f = _mm256_unpackhi_epi16(k, l);
__m256i mn_8f = _mm256_unpackhi_epi16(m, n);
__m256i op_8f = _mm256_unpackhi_epi16(o, p);
__m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
__m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
__m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
__m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
__m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
__m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
__m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
__m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
__m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
__m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
__m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
__m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
__m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
__m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
__m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
__m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
__m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
__m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
__m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
__m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
__m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
__m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
__m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
__m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
__m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
__m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
__m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
__m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
__m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
__m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
__m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
__m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
// NOTE: no unpacklo/hi instr in this case, so using permute instr.
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
kernel.packet[0].x = a_p_0;
kernel.packet[1].x = a_p_1;
kernel.packet[2].x = a_p_2;
kernel.packet[3].x = a_p_3;
kernel.packet[4].x = a_p_4;
kernel.packet[5].x = a_p_5;
kernel.packet[6].x = a_p_6;
kernel.packet[7].x = a_p_7;
kernel.packet[8].x = a_p_8;
kernel.packet[9].x = a_p_9;
kernel.packet[10].x = a_p_a;
kernel.packet[11].x = a_p_b;
kernel.packet[12].x = a_p_c;
kernel.packet[13].x = a_p_d;
kernel.packet[14].x = a_p_e;
kernel.packet[15].x = a_p_f;
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,8>& kernel) {
EIGEN_ALIGN64 half in[8][16];
pstore<half>(in[0], kernel.packet[0]);
pstore<half>(in[1], kernel.packet[1]);
pstore<half>(in[2], kernel.packet[2]);
pstore<half>(in[3], kernel.packet[3]);
pstore<half>(in[4], kernel.packet[4]);
pstore<half>(in[5], kernel.packet[5]);
pstore<half>(in[6], kernel.packet[6]);
pstore<half>(in[7], kernel.packet[7]);
EIGEN_ALIGN64 half out[8][16];
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
out[i][j] = in[j][2*i];
}
for (int j = 0; j < 8; ++j) {
out[i][j+8] = in[j][2*i+1];
}
}
kernel.packet[0] = pload<Packet16h>(out[0]);
kernel.packet[1] = pload<Packet16h>(out[1]);
kernel.packet[2] = pload<Packet16h>(out[2]);
kernel.packet[3] = pload<Packet16h>(out[3]);
kernel.packet[4] = pload<Packet16h>(out[4]);
kernel.packet[5] = pload<Packet16h>(out[5]);
kernel.packet[6] = pload<Packet16h>(out[6]);
kernel.packet[7] = pload<Packet16h>(out[7]);
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,4>& kernel) {
EIGEN_ALIGN64 half in[4][16];
pstore<half>(in[0], kernel.packet[0]);
pstore<half>(in[1], kernel.packet[1]);
pstore<half>(in[2], kernel.packet[2]);
pstore<half>(in[3], kernel.packet[3]);
EIGEN_ALIGN64 half out[4][16];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
out[i][j] = in[j][4*i];
}
for (int j = 0; j < 4; ++j) {
out[i][j+4] = in[j][4*i+1];
}
for (int j = 0; j < 4; ++j) {
out[i][j+8] = in[j][4*i+2];
}
for (int j = 0; j < 4; ++j) {
out[i][j+12] = in[j][4*i+3];
}
}
kernel.packet[0] = pload<Packet16h>(out[0]);
kernel.packet[1] = pload<Packet16h>(out[1]);
kernel.packet[2] = pload<Packet16h>(out[2]);
kernel.packet[3] = pload<Packet16h>(out[3]);
}
#elif defined EIGEN_VECTORIZE_AVX
typedef struct {
@@ -492,6 +860,30 @@ template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half*
to[stride*7].x = aux[7].x;
}
template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux<Packet8f>(af);
return Eigen::half(reduced);
}
template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_max<Packet8f>(af);
return Eigen::half(reduced);
}
template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_min<Packet8f>(af);
return Eigen::half(reduced);
}
template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_mul<Packet8f>(af);
return Eigen::half(reduced);
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,8>& kernel) {
__m128i a = kernel.packet[0].x;

View File

@@ -100,6 +100,33 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(cons
return __floats2half2_rn(a.x, a.y);
}
#elif defined EIGEN_VECTORIZE_AVX512
template <>
struct type_casting_traits<half, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
return half2float(a);
}
template <>
struct type_casting_traits<float, half> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
return float2half(a);
}
#elif defined EIGEN_VECTORIZE_AVX
template <>

View File

@@ -0,0 +1,29 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
}; \
\
template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
};
#endif // EIGEN_ARCH_CONJ_HELPER_H

View File

@@ -16,8 +16,14 @@ namespace Eigen {
namespace internal {
inline uint32x4_t p4ui_CONJ_XOR() {
// See bug 1325, clang fails to call vld1q_u64.
#if EIGEN_COMP_CLANG
uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
return ret;
#else
static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
return vld1q_u32( conj_XOR_DATA );
#endif
}
inline uint32x2_t p2ui_CONJ_XOR() {
@@ -61,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
float32x2_t r64;
r64 = vld1_f32((float *)&from);
r64 = vld1_f32((const float *)&from);
return Packet2cf(vcombine_f32(r64, r64));
}
@@ -136,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -259,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for NEON
@@ -269,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
s = vmulq_f32(b.v, b.v);
rev_s = vrev64q_f32(s);
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
}
EIGEN_DEVICE_FUNC inline void
@@ -282,8 +290,13 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
//---------- double ----------
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
// See bug 1325, clang fails to call vld1q_u64.
#if EIGEN_COMP_CLANG
static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
#else
const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
#endif
struct Packet1cd
{
@@ -370,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
@@ -445,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for NEON

View File

@@ -28,11 +28,42 @@ namespace internal {
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#if EIGEN_ARCH_ARM64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
#endif
#if EIGEN_COMP_MSVC
// In MSVC's arm_neon.h header file, all NEON vector types
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)
template<typename T,int unique_id>
struct eigen_packet_wrapper
{
operator T&() { return m_val; }
operator const T&() const { return m_val; }
eigen_packet_wrapper() {}
eigen_packet_wrapper(const T &v) : m_val(v) {}
eigen_packet_wrapper& operator=(const T &v) {
m_val = v;
return *this;
}
T m_val;
};
typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
#else
typedef float32x2_t Packet2f;
typedef float32x4_t Packet4f;
@@ -40,23 +71,28 @@ typedef int32x4_t Packet4i;
typedef int32x2_t Packet2i;
typedef uint32x4_t Packet4ui;
#endif // EIGEN_COMP_MSVC
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
// which available on LLVM and GCC (at least)
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#if EIGEN_ARCH_ARM64
// __builtin_prefetch tends to do nothing on ARM64 compilers because the
// prefetch instructions there are too detailed for __builtin_prefetch to map
// meaningfully to them.
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif !EIGEN_ARCH_ARM64
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#elif EIGEN_ARCH_ARM32
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else
// by default no explicit prefetching
#define EIGEN_ARM_PREFETCH(ADDR)
@@ -81,7 +117,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasSqrt = 0
};
};
template<> struct packet_traits<int> : default_packet_traits
template<> struct packet_traits<int32_t> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half; // Packet2i intrinsics not implemented yet
@@ -103,19 +139,19 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
const float32_t f[] = {0, 1, 2, 3};
const float f[] = {0, 1, 2, 3};
Packet4f countdown = vld1q_f32(f);
return vaddq_f32(pset1<Packet4f>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
{
const int32_t i[] = {0, 1, 2, 3};
Packet4i countdown = vld1q_s32(i);
@@ -238,20 +274,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
float32x2_t lo, hi;
lo = vld1_dup_f32(from);
hi = vld1_dup_f32(from+1);
return vcombine_f32(lo, hi);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
{
int32x2_t lo, hi;
lo = vld1_dup_s32(from);
@@ -259,11 +295,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
return vcombine_s32(lo, hi);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
@@ -274,7 +310,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
res = vsetq_lane_f32(from[3*stride], res, 3);
return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
{
Packet4i res = pset1<Packet4i>(0);
res = vsetq_lane_s32(from[0*stride], res, 0);
@@ -291,7 +327,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
to[stride*2] = vgetq_lane_f32(from, 2);
to[stride*3] = vgetq_lane_f32(from, 3);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
{
to[stride*0] = vgetq_lane_s32(from, 0);
to[stride*1] = vgetq_lane_s32(from, 1);
@@ -299,12 +335,12 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
to[stride*3] = vgetq_lane_s32(from, 3);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
// FIXME only store the 2 first elements ?
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi;
@@ -359,7 +395,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
return sum;
}
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, sum;
@@ -406,7 +442,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
return vget_lane_f32(prod, 0);
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, prod;
@@ -434,7 +470,7 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
return vget_lane_f32(min, 0);
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, min;
@@ -459,7 +495,7 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
return vget_lane_f32(max, 0);
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, max;

View File

@@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
}
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
@@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
@@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
@@ -476,6 +444,26 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, co
return Packet2cf(_mm_castpd_ps(result));
}
template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b)
{
return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b)));
}
template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b)
{
return pset1<Packet1cd>(b);
}
template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b)
{
return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b)));
}
template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b)
{
return pset1<Packet1cd>(b);
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -32,7 +32,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
/* the smallest non denormalized float number */
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f);
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
@@ -444,25 +444,33 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
#if EIGEN_FAST_MATH
// This is based on Quake3's fast inverse square root.
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. It does not handle +inf, or denormalized numbers correctly.
// The main advantage of this approach is not just speed, but also the fact that
// it can be inlined and pipelined with other computations, further reducing its
// effective latency. This is similar to Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/
// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& _x)
{
Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
Packet4f denormal_mask = _mm_and_ps(
_mm_cmpge_ps(_x, _mm_setzero_ps()),
_mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
/* select only the inverse sqrt of non-zero inputs */
Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()));
Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
// Compute approximate reciprocal sqrt.
Packet4f x = _mm_rsqrt_ps(_x);
// Do a single step of Newton's iteration.
x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
return pmul(_x,x);
// Flush results for denormals to zero.
return _mm_andnot_ps(denormal_mask, pmul(_x,x));
}
#else
template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
#endif
@@ -491,7 +499,7 @@ Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
_mm_and_ps(zero_mask, p4f_inf));
_mm_and_ps(zero_mask, p4f_inf));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));

View File

@@ -28,7 +28,7 @@ namespace internal {
#endif
#endif
#if (defined EIGEN_VECTORIZE_AVX) && EIGEN_COMP_GNUC_STRICT && (__GXX_ABI_VERSION < 1004)
#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)
// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
// have overloads for both types without linking error.
// One solution is to increase ABI version using -fabi-version=4 (or greater).
@@ -409,10 +409,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}
#if EIGEN_COMP_PGI
typedef const void * SsePrefetchPtrType;
#else
typedef const char * SsePrefetchPtrType;
#endif
#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
@@ -504,30 +510,13 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
return _mm_hadd_pd(vecs[0], vecs[1]);
}
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet4f tmp0 = _mm_hadd_ps(a,a);
return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
}
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
#else
// SSE2 versions
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
Packet4f tmp0, tmp1, tmp2;
@@ -548,6 +537,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
}
#endif // SSE3
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
// Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
// (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
// return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
// #else
Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
// #endif
}
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
// Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
// (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// return pfirst<Packet2d>(_mm_hadd_pd(a, a));
// #else
return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
// #endif
}
#ifdef EIGEN_VECTORIZE_SSSE3
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -818,6 +830,44 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blend_ps(a,pset1<Packet4f>(b),1);
#else
return _mm_move_ss(a, _mm_load_ss(&b));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blend_pd(a,pset1<Packet2d>(b),1);
#else
return _mm_move_sd(a, _mm_load_sd(&b));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
#else
const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
#else
const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
#endif
}
// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
@@ -832,4 +882,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
} // end namespace Eigen
#if EIGEN_COMP_PGI
// PGI++ does not define the following intrinsics in C++ mode.
static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif
#endif // EIGEN_PACKET_MATH_SSE_H

View File

@@ -14,6 +14,7 @@ namespace Eigen {
namespace internal {
#ifndef EIGEN_VECTORIZE_AVX
template <>
struct type_casting_traits<float, int> {
enum {
@@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
};
};
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}
template <>
struct type_casting_traits<int, float> {
enum {
@@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
};
};
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}
template <>
struct type_casting_traits<double, float> {
enum {
@@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
};
};
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}
template <>
struct type_casting_traits<float, double> {
enum {
@@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
TgtCoeffRatio = 2
};
};
#endif
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}
template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
// Simply discard the second half of the input

View File

@@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,13 +25,48 @@ struct Packet1cd
Packet2d v;
};
struct Packet2cf
{
EIGEN_STRONG_INLINE Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
union {
Packet4f v;
Packet1cd cd[2];
};
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasBlend = 1,
HasSetLinear = 0
};
};
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet1cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
AlignedOnScalar = 1,
size = 1,
HasHalfPacket = 0,
@@ -47,20 +83,68 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
/* Forward declaration */
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
Packet2cf res;
res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
res.cd[1] = res.cd[0];
return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
std::complex<float> EIGEN_ALIGN16 af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2cf>(af);
}
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
{
return pload<Packet1cd>(from);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
std::complex<float> EIGEN_ALIGN16 af[2];
pstore<std::complex<float> >((std::complex<float> *) af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
{
pstore<std::complex<double> >(to, from);
}
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
{
Packet2cf res;
res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
return res;
}
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
@@ -79,43 +163,90 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
return Packet1cd(v1 + v2);
}
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
return pset1<Packet1cd>(*from);
Packet2cf res;
res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
return res;
}
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
std::complex<double> EIGEN_ALIGN16 res[2];
pstore<std::complex<double> >(res, a);
std::complex<double> EIGEN_ALIGN16 res;
pstore<std::complex<double> >(&res, a);
return res;
}
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
std::complex<float> EIGEN_ALIGN16 res[2];
pstore<std::complex<float> >(res, a);
return res[0];
}
template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
Packet2cf res;
res.cd[0] = a.cd[1];
res.cd[1] = a.cd[0];
return res;
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
{
return pfirst(a);
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
std::complex<float> res;
Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
vec_st2f(b.v, (float*)&res);
return res;
}
template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
{
return vecs[0];
}
template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
PacketBlock<Packet2cf,2> transpose;
transpose.packet[0] = vecs[0];
transpose.packet[1] = vecs[1];
ptranspose(transpose);
return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]);
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
{
return pfirst(a);
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
{
std::complex<float> res;
Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
vec_st2f(b.v, (float*)&res);
return res;
}
template<int Offset>
struct palign_impl<Offset,Packet1cd>
@@ -127,6 +258,18 @@ struct palign_impl<Offset,Packet1cd>
}
};
template<int Offset>
struct palign_impl<Offset,Packet2cf>
{
static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
{
if (Offset == 1) {
first.cd[0] = first.cd[1];
first.cd[1] = second.cd[0];
}
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
@@ -160,6 +303,42 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return pconj(internal::pmul(a, b));
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for AltiVec
@@ -168,17 +347,49 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
}
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for AltiVec
Packet2cf res;
res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
return res;
}
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
return Packet1cd(preverse(Packet2d(x.v)));
}
EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
{
Packet2cf res;
res.cd[0] = pcplxflip(x.cd[0]);
res.cd[1] = pcplxflip(x.cd[1]);
return res;
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
{
Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
kernel.packet[0].v = tmp;
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
Packet1cd tmp = kernel.packet[0].cd[1];
kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
kernel.packet[1].cd[0] = tmp;
}
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
Packet2cf result;
const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
return result;
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -3,6 +3,7 @@
//
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -19,32 +20,32 @@ namespace Eigen {
namespace internal {
static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
{
Packet2d x = _x;
_EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
_EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
_EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
_EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
_EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
_EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
Packet2d tmp, fx;
Packet2l emm0;
@@ -91,18 +92,44 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
isnumber_mask);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& x)
{
Packet4f res;
res.v4f[0] = pexp<Packet2d>(x.v4f[0]);
res.v4f[1] = pexp<Packet2d>(x.v4f[1]);
return res;
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d psqrt<Packet2d>(const Packet2d& x)
{
return __builtin_s390_vfsqdb(x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& x)
{
Packet4f res;
res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
return res;
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d prsqrt<Packet2d>(const Packet2d& x) {
// Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& x) {
Packet4f res;
res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
return res;
}
} // end namespace internal
} // end namespace Eigen

View File

@@ -28,9 +28,8 @@ namespace internal {
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
typedef __vector int Packet4i;
@@ -42,6 +41,10 @@ typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;
typedef struct {
Packet2d v4f[2];
} Packet4f;
typedef union {
int32_t i[4];
uint32_t ui[4];
@@ -88,6 +91,7 @@ static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
@@ -96,7 +100,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
// Mask alignment
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
// Handle endianness properly while loading constants
// Define global static constants:
@@ -132,13 +136,11 @@ template<> struct packet_traits<int> : default_packet_traits
typedef Packet4i type;
typedef Packet4i half;
enum {
// FIXME check the Has*
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
HasHalfPacket = 0,
// FIXME check the Has*
HasAdd = 1,
HasSub = 1,
HasMul = 1,
@@ -147,6 +149,37 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasMin = 1,
HasMax = 1,
HasAbs = 1,
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasRound = 1,
HasFloor = 1,
HasCeil = 1,
HasNegate = 1,
HasBlend = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet2d type;
@@ -157,7 +190,6 @@ template<> struct packet_traits<double> : default_packet_traits
size=2,
HasHalfPacket = 1,
// FIXME check the Has*
HasAdd = 1,
HasSub = 1,
HasMul = 1,
@@ -180,8 +212,12 @@ template<> struct packet_traits<double> : default_packet_traits
};
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
/* Forward declaration */
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
{
Packet vt;
@@ -222,6 +258,32 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
return s;
}
/* Helper function to simulate a vec_splat_packet4f
*/
template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
{
Packet4f splat;
switch (element) {
case 0:
splat.v4f[0] = vec_splat(from.v4f[0], 0);
splat.v4f[1] = splat.v4f[0];
break;
case 1:
splat.v4f[0] = vec_splat(from.v4f[0], 1);
splat.v4f[1] = splat.v4f[0];
break;
case 2:
splat.v4f[0] = vec_splat(from.v4f[1], 0);
splat.v4f[1] = splat.v4f[0];
break;
case 3:
splat.v4f[0] = vec_splat(from.v4f[1], 1);
splat.v4f[1] = splat.v4f[0];
break;
}
return splat;
}
template<int Offset>
struct palign_impl<Offset,Packet4i>
{
@@ -238,6 +300,31 @@ struct palign_impl<Offset,Packet4i>
}
};
/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
*/
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
{
switch (Offset % 4) {
case 1:
first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
break;
case 2:
first.v4f[0] = first.v4f[1];
first.v4f[1] = second.v4f[0];
break;
case 3:
first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
break;
}
}
};
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
@@ -257,6 +344,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
return vfrom->v4i;
}
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
{
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_LOAD
Packet4f vfrom;
vfrom.v4f[0] = vec_ld2f(&from[0]);
vfrom.v4f[1] = vec_ld2f(&from[2]);
return vfrom;
}
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
{
// FIXME: No intrinsic yet
@@ -275,6 +372,15 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
vto->v4i = from;
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
{
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_STORE
vec_st2f(from.v4f[0], &to[0]);
vec_st2f(from.v4f[1], &to[2]);
}
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
// FIXME: No intrinsic yet
@@ -288,10 +394,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
{
return vec_splats(from);
}
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
return vec_splats(from);
}
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
{
Packet4f to;
to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
to.v4f[1] = to.v4f[0];
return to;
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
@@ -304,6 +416,17 @@ pbroadcast4<Packet4i>(const int *a,
a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
a3 = pload<Packet4f>(a);
a0 = vec_splat_packet4f<0>(a3);
a1 = vec_splat_packet4f<1>(a3);
a2 = vec_splat_packet4f<2>(a3);
a3 = vec_splat_packet4f<3>(a3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
@@ -326,6 +449,16 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
return pload<Packet4i>(ai);
}
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
float EIGEN_ALIGN16 ai[4];
ai[0] = from[0*stride];
ai[1] = from[1*stride];
ai[2] = from[2*stride];
ai[3] = from[3*stride];
return pload<Packet4f>(ai);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
double EIGEN_ALIGN16 af[2];
@@ -344,6 +477,16 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
to[3*stride] = ai[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
float EIGEN_ALIGN16 ai[4];
pstore<float>((float *)ai, from);
to[0*stride] = ai[0];
to[1*stride] = ai[1];
to[2*stride] = ai[2];
to[3*stride] = ai[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
double EIGEN_ALIGN16 af[2];
@@ -353,52 +496,160 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
}
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f c;
c.v4f[0] = a.v4f[0] + b.v4f[0];
c.v4f[1] = a.v4f[1] + b.v4f[1];
return c;
}
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f c;
c.v4f[0] = a.v4f[0] - b.v4f[0];
c.v4f[1] = a.v4f[1] - b.v4f[1];
return c;
}
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f c;
c.v4f[0] = a.v4f[0] * b.v4f[0];
c.v4f[1] = a.v4f[1] * b.v4f[1];
return c;
}
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f c;
c.v4f[0] = a.v4f[0] / b.v4f[0];
c.v4f[1] = a.v4f[1] / b.v4f[1];
return c;
}
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
Packet4f c;
c.v4f[0] = -a.v4f[0];
c.v4f[1] = -a.v4f[1];
return c;
}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
{
Packet4f res;
res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f res;
res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f res;
res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f res;
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f res;
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f res;
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f res;
res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
{
Packet4f res;
res.v4f[0] = vec_round(a.v4f[0]);
res.v4f[1] = vec_round(a.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
{
Packet4f res;
res.v4f[0] = vec_ceil(a.v4f[0]);
res.v4f[1] = vec_ceil(a.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
{
Packet4f res;
res.v4f[0] = vec_floor(a.v4f[0]);
res.v4f[1] = vec_floor(a.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
@@ -408,6 +659,14 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
Packet4f p = pload<Packet4f>(from);
p.v4f[1] = vec_splat(p.v4f[0], 1);
p.v4f[0] = vec_splat(p.v4f[0], 0);
return p;
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
Packet2d p = pload<Packet2d>(from);
@@ -415,12 +674,15 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
}
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
@@ -433,8 +695,23 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
}
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
Packet4f rev;
rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
return rev;
}
template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
{
Packet4f res;
res.v4f[0] = pabs(a.v4f[0]);
res.v4f[1] = pabs(a.v4f[1]);
return res;
}
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
@@ -453,6 +730,13 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
sum = padd<Packet2d>(a, b);
return pfirst(sum);
}
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet2d sum;
sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
double first = predux<Packet2d>(sum);
return static_cast<float>(first);
}
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
@@ -493,6 +777,21 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
return sum;
}
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
PacketBlock<Packet4f,4> transpose;
transpose.packet[0] = vecs[0];
transpose.packet[1] = vecs[1];
transpose.packet[2] = vecs[2];
transpose.packet[3] = vecs[3];
ptranspose(transpose);
Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
sum = padd(sum, transpose.packet[2]);
sum = padd(sum, transpose.packet[3]);
return sum;
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@@ -507,6 +806,12 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
// Return predux_mul<Packet2d> of the subvectors product
return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
}
// min
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
@@ -521,6 +826,14 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
Packet2d b, res;
b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
return static_cast<float>(pfirst(res));
}
// max
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
@@ -536,6 +849,14 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
Packet2d b, res;
b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
return static_cast<float>(pfirst(res));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
@@ -556,12 +877,61 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
kernel.packet[1] = t1;
}
/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
*/
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
PacketBlock<Packet2d,2> t0,t1,t2,t3;
// copy top-left 2x2 Packet2d block
t0.packet[0] = kernel.packet[0].v4f[0];
t0.packet[1] = kernel.packet[1].v4f[0];
// copy top-right 2x2 Packet2d block
t1.packet[0] = kernel.packet[0].v4f[1];
t1.packet[1] = kernel.packet[1].v4f[1];
// copy bottom-left 2x2 Packet2d block
t2.packet[0] = kernel.packet[2].v4f[0];
t2.packet[1] = kernel.packet[3].v4f[0];
// copy bottom-right 2x2 Packet2d block
t3.packet[0] = kernel.packet[2].v4f[1];
t3.packet[1] = kernel.packet[3].v4f[1];
// Transpose all 2x2 blocks
ptranspose(t0);
ptranspose(t1);
ptranspose(t2);
ptranspose(t3);
// Copy back transposed blocks, but exchange t1 and t2 due to transposition
kernel.packet[0].v4f[0] = t0.packet[0];
kernel.packet[0].v4f[1] = t2.packet[0];
kernel.packet[1].v4f[0] = t0.packet[1];
kernel.packet[1].v4f[1] = t2.packet[1];
kernel.packet[2].v4f[0] = t1.packet[0];
kernel.packet[2].v4f[1] = t3.packet[0];
kernel.packet[3].v4f[0] = t1.packet[1];
kernel.packet[3].v4f[1] = t3.packet[1];
}
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
return vec_sel(elsePacket, thenPacket, mask);
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
Packet4f result;
result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
return result;
}
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));

View File

@@ -28,7 +28,7 @@ template<typename DstScalar,typename SrcScalar> struct assign_op {
{ internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
};
// Empty overload for void type (used by PermutationMatrix
// Empty overload for void type (used by PermutationMatrix)
template<typename DstScalar> struct assign_op<DstScalar,void> {};
template<typename DstScalar,typename SrcScalar>

View File

@@ -255,7 +255,7 @@ struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,Rh
/** \internal
* \brief Template functor to compute the hypot of two scalars
* \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
*
* \sa MatrixBase::stableNorm(), class Redux
*/
@@ -263,22 +263,15 @@ template<typename Scalar>
struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
{
EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
// typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
{
using std::sqrt;
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
return p * sqrt(Scalar(1) + qp*qp);
// This functor is used by hypotNorm only for which it is faster to first apply abs
// on all coefficients prior to reduction through hypot.
// This way we avoid calling abs on positive and real entries, and this also permits
// to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
// through the same functor...
return internal::positive_real_hypot(x,y);
}
};
template<typename Scalar>

View File

@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -37,87 +37,77 @@ template<typename Scalar>
struct functor_traits<scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
template <typename Scalar, typename Packet, bool RandomAccess, bool IsInteger> struct linspaced_op_impl;
template <typename Scalar, typename Packet, bool IsInteger> struct linspaced_op_impl;
// linear access for packet ops:
// 1) initialization
// base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
// base += [size*step, ..., size*step]
//
// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
// in order to avoid the padd() in operator() ?
template <typename Scalar, typename Packet>
struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/false,/*IsInteger*/false>
struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
{
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*m_step)),
m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(m_step),plset<Packet>(-unpacket_traits<Packet>::size)))) {}
m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
m_flip(numext::abs(high)<numext::abs(low))
{}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const
{
m_base = padd(m_base, pset1<Packet>(m_step));
return m_low+Scalar(i)*m_step;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
typedef typename NumTraits<Scalar>::Real RealScalar;
if(m_flip)
return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
else
return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType) const { return m_base = padd(m_base,m_packetStep); }
const Scalar m_low;
const Scalar m_step;
const Packet m_packetStep;
mutable Packet m_base;
};
// random access for packet ops:
// 1) each step
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
template <typename Scalar, typename Packet>
struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/false>
{
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return m_low+i*m_step; }
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
{ return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(i)),m_interPacket))); }
{
// Principle:
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
if(m_flip)
{
Packet pi = plset<Packet>(Scalar(i-m_size1));
Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
if(i==0)
res = pinsertfirst(res, m_low);
return res;
}
else
{
Packet pi = plset<Packet>(Scalar(i));
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if(i==m_size1-unpacket_traits<Packet>::size+1)
res = pinsertlast(res, m_high);
return res;
}
}
const Scalar m_low;
const Scalar m_high;
const Index m_size1;
const Scalar m_step;
const Packet m_lowPacket;
const Packet m_stepPacket;
const Packet m_interPacket;
const bool m_flip;
};
template <typename Scalar, typename Packet>
struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/true>
struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
{
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_length(high-low), m_divisor(convert_index<Scalar>(num_steps==1?1:num_steps-1)), m_interPacket(plset<Packet>(0))
m_low(low),
m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)<num_steps)
{}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Scalar operator() (IndexType i) const {
return m_low + (m_length*Scalar(i))/m_divisor;
const Scalar operator() (IndexType i) const
{
if(m_use_divisor) return m_low + convert_index<Scalar>(i)/m_divisor;
else return m_low + convert_index<Scalar>(i)*m_multiplier;
}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Packet packetOp(IndexType i) const {
return internal::padd(pset1<Packet>(m_low), pdiv(pmul(pset1<Packet>(m_length), padd(pset1<Packet>(Scalar(i)),m_interPacket)),
pset1<Packet>(m_divisor))); }
const Scalar m_low;
const Scalar m_length;
const Scalar m_divisor;
const Packet m_interPacket;
const Scalar m_multiplier;
const Scalar m_divisor;
const bool m_use_divisor;
};
// ----- Linspace functor ----------------------------------------------------------------
@@ -125,18 +115,18 @@ struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/true>
// Forward declaration (we default to random access which does not really give
// us a speed gain when using packet access but it allows to use the functor in
// nested expressions).
template <typename Scalar, typename PacketType, bool RandomAccess = true> struct linspaced_op;
template <typename Scalar, typename PacketType, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,PacketType,RandomAccess> >
template <typename Scalar, typename PacketType> struct linspaced_op;
template <typename Scalar, typename PacketType> struct functor_traits< linspaced_op<Scalar,PacketType> >
{
enum
{
Cost = 1,
PacketAccess = packet_traits<Scalar>::HasSetLinear
&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),
PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend,
/*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/ // <- vectorization for integer is currently disabled
IsRepeatable = true
};
};
template <typename Scalar, typename PacketType, bool RandomAccess> struct linspaced_op
template <typename Scalar, typename PacketType> struct linspaced_op
{
linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
: impl((num_steps==1 ? high : low),high,num_steps)
@@ -148,20 +138,49 @@ template <typename Scalar, typename PacketType, bool RandomAccess> struct linspa
template<typename Packet,typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); }
// This proxy object handles the actual required temporaries, the different
// implementations (random vs. sequential access) as well as the
// correct piping to size 2/4 packet operations.
// As long as we don't have a Bresenham-like implementation for linear-access and integer types,
// we have to by-pass RandomAccess for integer types. See bug 698.
const linspaced_op_impl<Scalar,PacketType,(NumTraits<Scalar>::IsInteger?true:RandomAccess),NumTraits<Scalar>::IsInteger> impl;
// This proxy object handles the actual required temporaries and the different
// implementations (integer vs. floating point).
const linspaced_op_impl<Scalar,PacketType,NumTraits<Scalar>::IsInteger> impl;
};
// Linear access is automatically determined from the operator() prototypes available for the given functor.
// If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
// and linear access is not possible. In all other cases, linear access is enabled.
// Users should not have to deal with this struture.
// Users should not have to deal with this structure.
template<typename Functor> struct functor_has_linear_access { enum { ret = !has_binary_operator<Functor>::value }; };
// For unreliable compilers, let's specialize the has_*ary_operator
// helpers so that at least built-in nullary functors work fine.
#if !( (EIGEN_COMP_MSVC>1600) || (EIGEN_GNUC_AT_LEAST(4,8)) || (EIGEN_COMP_ICC>=1600))
template<typename Scalar,typename IndexType>
struct has_nullary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 1}; };
template<typename Scalar,typename IndexType>
struct has_unary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0}; };
template<typename Scalar,typename IndexType>
struct has_binary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0}; };
template<typename Scalar,typename IndexType>
struct has_nullary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0}; };
template<typename Scalar,typename IndexType>
struct has_unary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0}; };
template<typename Scalar,typename IndexType>
struct has_binary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 1}; };
template<typename Scalar, typename PacketType,typename IndexType>
struct has_nullary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
template<typename Scalar, typename PacketType,typename IndexType>
struct has_unary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 1}; };
template<typename Scalar, typename PacketType,typename IndexType>
struct has_binary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
template<typename Scalar,typename IndexType>
struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
template<typename Scalar,typename IndexType>
struct has_unary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
template<typename Scalar,typename IndexType>
struct has_binary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
#endif
} // end namespace internal
} // end namespace Eigen

View File

@@ -72,7 +72,7 @@ template<typename T>
struct functor_traits<std::not_equal_to<T> >
{ enum { Cost = 1, PacketAccess = false }; };
#if(__cplusplus < 201103L)
#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900)
// std::binder* are deprecated since c++11 and will be removed in c++17
template<typename T>
struct functor_traits<std::binder2nd<T> >
@@ -83,13 +83,17 @@ struct functor_traits<std::binder1st<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
#endif
#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910)
// std::unary_negate is deprecated since c++17 and will be removed in c++20
template<typename T>
struct functor_traits<std::unary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
// std::binary_negate is deprecated since c++17 and will be removed in c++20
template<typename T>
struct functor_traits<std::binary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
#endif
#ifdef EIGEN_STDEXT_SUPPORT

View File

@@ -321,7 +321,7 @@ struct functor_traits<scalar_log1p_op<Scalar> > {
*/
template<typename Scalar> struct scalar_log10_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log10; return log10(a); }
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD_MATH(log10) return log10(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); }
};

View File

@@ -580,7 +580,7 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
}
template<typename Packet>
const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
{
return a;
}
@@ -972,7 +972,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
internal::prefetch(blA+(3*K+16)*LhsProgress); \
if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
@@ -1581,10 +1581,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
if(SwappedTraits::LhsProgress==8)
{
// Special case where we have to first reduce the accumulation register C0
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
@@ -1596,13 +1596,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
SRhsPacketHalf b0;
straits.loadLhsUnaligned(blB, a0);
straits.loadRhs(blA, b0);
SAccPacketHalf c0 = predux4(C0);
SAccPacketHalf c0 = predux_downto4(C0);
straits.madd(a0,b0,c0,b0);
straits.acc(c0, alphav, R);
}
else
{
straits.acc(predux4(C0), alphav, R);
straits.acc(predux_downto4(C0), alphav, R);
}
res.scatterPacket(i, j2, R);
}

View File

@@ -10,7 +10,7 @@
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H
namespace Eigen {
namespace Eigen {
namespace internal {
@@ -24,7 +24,7 @@ template<
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
{
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index depth,
@@ -54,7 +54,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
{
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
@@ -83,15 +83,15 @@ static void run(Index rows, Index cols, Index depth,
if(info)
{
// this is the parallel version!
Index tid = omp_get_thread_num();
Index threads = omp_get_num_threads();
int tid = omp_get_thread_num();
int threads = omp_get_num_threads();
LhsScalar* blockA = blocking.blockA();
eigen_internal_assert(blockA!=0);
std::size_t sizeB = kc*nc;
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
for(Index k=0; k<depth; k+=kc)
{
@@ -114,11 +114,11 @@ static void run(Index rows, Index cols, Index depth,
// Notify the other threads that the part A'_i is ready to go.
info[tid].sync = k;
// Computes C_i += A' * B' per A'_i
for(Index shift=0; shift<threads; ++shift)
for(int shift=0; shift<threads; ++shift)
{
Index i = (tid+shift)%threads;
int i = (tid+shift)%threads;
// At this point we have to make sure that A'_i has been updated by the thread i,
// we use testAndSetOrdered to mimic a volatile access.
@@ -161,7 +161,7 @@ static void run(Index rows, Index cols, Index depth,
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
@@ -172,24 +172,24 @@ static void run(Index rows, Index cols, Index depth,
for(Index k2=0; k2<depth; k2+=kc)
{
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the rhs's
// horizontal panel which is, in practice, a very low number.
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
// For each kc x nc block of the rhs's horizontal panel...
for(Index j2=0; j2<cols; j2+=nc)
{
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
if((!pack_rhs_once) || i2==0)
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
// Everything is packed, we can now call the panel * block kernel:
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
}
@@ -229,7 +229,7 @@ struct gemm_functor
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
m_actualAlpha, m_blocking, info);
}
typedef typename Gemm::Traits Traits;
protected:
@@ -313,7 +313,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
#endif
}
void initParallel(Index, Index, Index, Index)
{}
@@ -359,14 +359,14 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
}
void initParallel(Index rows, Index cols, Index depth, Index num_threads)
{
this->m_mc = Transpose ? cols : rows;
this->m_nc = Transpose ? rows : cols;
this->m_kc = depth;
eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
Index m = this->m_mc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
m_sizeA = this->m_mc * this->m_kc;
@@ -401,7 +401,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
} // end namespace internal
namespace internal {
template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
@@ -409,21 +409,21 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
enum {
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
};
typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
template<typename Dst>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
@@ -453,7 +453,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
}
template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
{
@@ -481,7 +481,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
}
};

View File

@@ -148,7 +148,7 @@ struct tribb_kernel
ResMapper res(_res, resStride);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
// let's process the block per panel of actual_mc x BlockSize,
// again, each is split into three parts, etc.
@@ -199,7 +199,7 @@ struct general_product_to_triangular_selector;
template<typename MatrixType, typename ProductType, int UpLo>
struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
{
static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
{
typedef typename MatrixType::Scalar Scalar;
@@ -217,6 +217,9 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
if(!beta)
mat.template triangularView<UpLo>().setZero();
enum {
StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
@@ -244,7 +247,7 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
template<typename MatrixType, typename ProductType, int UpLo>
struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
{
static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
{
typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
@@ -260,13 +263,19 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
if(!beta)
mat.template triangularView<UpLo>().setZero();
enum {
IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0
RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
};
Index size = mat.cols();
if(SkipDiag)
size--;
Index depth = actualLhs.cols();
typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
@@ -277,20 +286,22 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
internal::general_matrix_matrix_triangular_product<Index,
typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
IsRowMajor ? RowMajor : ColMajor, UpLo>
IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)>
::run(size, depth,
&actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
mat.data(), mat.outerStride(), actualAlpha, blocking);
&actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
&actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking);
}
};
template<typename MatrixType, unsigned int UpLo>
template<typename ProductType>
TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha)
TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
{
EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha);
general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
return derived();
}

View File

@@ -33,7 +33,7 @@
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
namespace Eigen {
namespace Eigen {
namespace internal {
@@ -52,7 +52,7 @@ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,Con
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
{ \
if (lhs==rhs) { \
if ( lhs==rhs && ((UpLo&(Lower|Upper)==UpLo)) ) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
} else { \
@@ -86,9 +86,9 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
/* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
\
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
EIGTYPE beta; \
BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
EIGTYPE beta(1); \
BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
} \
};
@@ -107,7 +107,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
\
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'C':'N'); \
RTYPE alpha_, beta_; \
const EIGTYPE* a_ptr; \
\
@@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
} \
};
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
#else
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
#endif
// TODO hanlde complex cases
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)

View File

@@ -46,7 +46,7 @@ namespace internal {
// gemm specialization
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
template< \
typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
@@ -100,13 +100,20 @@ static void run(Index rows, Index cols, Index depth, \
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
}};
GEMM_SPECIALIZATION(double, d, double, d)
GEMM_SPECIALIZATION(float, f, float, s)
GEMM_SPECIALIZATION(dcomplex, cd, double, z)
GEMM_SPECIALIZATION(scomplex, cf, float, c)
#ifdef EIGEN_USE_MKL
GEMM_SPECIALIZATION(double, d, double, dgemm)
GEMM_SPECIALIZATION(float, f, float, sgemm)
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
#else
GEMM_SPECIALIZATION(double, d, double, dgemm_)
GEMM_SPECIALIZATION(float, f, float, sgemm_)
GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
#endif
} // end namespase internal

View File

@@ -183,8 +183,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
alignmentPattern = AllAligned;
}
const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
@@ -457,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,R
alignmentPattern = AllAligned;
}
const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)

View File

@@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \
@@ -113,14 +113,21 @@ static void run( \
x_ptr=x_tmp.data(); \
incx=1; \
} else x_ptr=rhs; \
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
#else
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
#endif
} // end namespase internal

View File

@@ -10,7 +10,7 @@
#ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H
namespace Eigen {
namespace Eigen {
namespace internal {
@@ -75,7 +75,7 @@ template<typename Index> struct GemmParallelInfo
{
GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
int volatile sync;
Index volatile sync;
int volatile users;
Index lhs_start;
@@ -83,7 +83,7 @@ template<typename Index> struct GemmParallelInfo
};
template<bool Condition, typename Functor, typename Index>
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
{
// TODO when EIGEN_USE_BLAS is defined,
// we should still enable OMP for other scalar types
@@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// the matrix product when multithreading is enabled. This is a temporary
// fix to support row-major destination matrices. This whole
// parallelizer mechanism has to be redisigned anyway.
EIGEN_UNUSED_VARIABLE(depth);
EIGEN_UNUSED_VARIABLE(transpose);
func(0,rows, 0,cols);
#else
@@ -103,9 +104,16 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
// - the sizes are large enough
// compute the maximal number of threads from the size of the product:
// FIXME this has to be fine tuned
// This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
Index size = transpose ? rows : cols;
Index pb_max_threads = std::max<Index>(1,size / 32);
Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
// compute the maximal number of threads from the total amount of work:
double work = static_cast<double>(rows) * static_cast<double>(cols) *
static_cast<double>(depth);
double kMinTaskSize = 50000; // FIXME improve this heuristic.
pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
// compute the number of threads we are going to use
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
@@ -120,19 +128,19 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
if(transpose)
std::swap(rows,cols);
ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
#pragma omp parallel num_threads(threads)
{
Index i = omp_get_thread_num();
// Note that the actual number of threads might be lower than the number of request ones.
Index actual_threads = omp_get_num_threads();
Index blockCols = (cols / actual_threads) & ~Index(0x3);
Index blockRows = (rows / actual_threads);
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
Index r0 = i*blockRows;
Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;

View File

@@ -40,7 +40,7 @@ namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -81,13 +81,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -144,20 +144,26 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
EIGEN_BLAS_SYMM_L(double, double, d, d)
EIGEN_BLAS_SYMM_L(float, float, f, s)
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
#else
EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
#endif
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -197,13 +203,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _lhs; \
\
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -259,15 +265,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
} \
};
EIGEN_BLAS_SYMM_R(double, double, d, d)
EIGEN_BLAS_SYMM_R(float, float, f, s)
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
#else
EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
#endif
} // end namespace internal
} // end namespace Eigen

View File

@@ -83,10 +83,10 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
Scalar t3(0);
Packet ptmp3 = pset1<Packet>(t3);
size_t starti = FirstTriangular ? 0 : j+2;
size_t endi = FirstTriangular ? j : size;
size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
Index starti = FirstTriangular ? 0 : j+2;
Index endi = FirstTriangular ? j : size;
Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
res[j] += cjd.pmul(numext::real(A0[j]), t0);
res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
@@ -101,7 +101,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
t2 += cj1.pmul(A0[j+1], rhs[j+1]);
}
for (size_t i=starti; i<alignedStart; ++i)
for (Index i=starti; i<alignedStart; ++i)
{
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
t2 += cj1.pmul(A0[i], rhs[i]);
@@ -113,7 +113,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
for (Index i=alignedStart; i<alignedEnd; i+=PacketSize)
{
Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize;
Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize;
@@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
pstore(resIt,Xi); resIt += PacketSize;
}
for (size_t i=alignedEnd; i<endi; i++)
for (Index i=alignedEnd; i<endi; i++)
{
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
t2 += cj1.pmul(A0[i], rhs[i]);

View File

@@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
x_tmp=map_x.conjugate(); \
x_ptr=x_tmp.data(); \
} else x_ptr=_rhs; \
BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
#else
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
#endif
} // end namespace internal

View File

@@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
// To work around an "error: member reference base type 'Matrix<...>
// (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
// not a structure or union" compilation error in nvcc (tested V8.0.61),
// create a dummy internal::constructor_without_unaligned_array_assert
// object to pass to the Matrix constructor.
internal::constructor_without_unaligned_array_assert a;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
@@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
internal::constructor_without_unaligned_array_assert a;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
@@ -393,7 +400,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
{
template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
{
typedef typename Dest::Scalar Scalar;
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef typename Dest::Scalar Scalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
@@ -405,8 +414,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(a_rhs);
LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs);
RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs);
Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
@@ -431,6 +441,21 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
&dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha, blocking
);
// Apply correction if the diagonal is unit and a scalar factor was nested:
if ((Mode&UnitDiag)==UnitDiag)
{
if (LhsIsTriangular && lhs_alpha!=LhsScalar(1))
{
Index diagSize = (std::min)(lhs.rows(),lhs.cols());
dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize);
}
else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1))
{
Index diagSize = (std::min)(rhs.rows(),rhs.cols());
dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize);
}
}
}
};

View File

@@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
// implements col-major += alpha * op(triangular) * op(general)
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
};
EIGEN_BLAS_TRMM_L(double, double, d, d)
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
EIGEN_BLAS_TRMM_L(float, float, f, s)
EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
EIGEN_BLAS_TRMM_L(float, float, f, strmm)
EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
#else
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
#endif
// implements col-major += alpha * op(general) * op(triangular)
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
};
EIGEN_BLAS_TRMM_R(double, double, d, d)
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
EIGEN_BLAS_TRMM_R(float, float, f, s)
EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
EIGEN_BLAS_TRMM_R(float, float, f, strmm)
EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
#else
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
#endif
} // end namespace internal
} // end namespace Eigen

View File

@@ -221,8 +221,9 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
* RhsBlasTraits::extractScalarFactor(rhs);
LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
enum {
// FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -274,6 +275,12 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
else
dest = MappedDest(actualDestPtr, dest.size());
}
if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
{
Index diagSize = (std::min)(lhs.rows(),lhs.cols());
dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
}
}
};
@@ -295,8 +302,9 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
* RhsBlasTraits::extractScalarFactor(rhs);
LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
enum {
DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
@@ -326,6 +334,12 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
actualRhsPtr,1,
dest.data(),dest.innerStride(),
actualAlpha);
if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
{
Index diagSize = (std::min)(lhs.rows(),lhs.cols());
dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
}
}
};

View File

@@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
// implements col-major: res += alpha * op(triangular) * vector
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
enum { \
@@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
EIGEN_BLAS_TRMV_CM(double, double, d, d)
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
EIGEN_BLAS_TRMV_CM(float, float, f, s)
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMV_CM(double, double, d, d,)
EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
EIGEN_BLAS_TRMV_CM(float, float, f, s,)
EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
#else
EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
#endif
// implements row-major: res += alpha * op(triangular) * vector
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
enum { \
@@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
EIGEN_BLAS_TRMV_RM(double, double, d, d)
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
EIGEN_BLAS_TRMV_RM(float, float, f, s)
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMV_RM(double, double, d, d,)
EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
EIGEN_BLAS_TRMV_RM(float, float, f, s,)
EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
#else
EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
#endif
} // end namespase internal

View File

@@ -183,7 +183,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
}
}
/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right
/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
*/
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
@@ -202,6 +202,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
level3_blocking<Scalar,Scalar>& blocking)
{
Index rows = otherSize;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
@@ -306,9 +307,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
}
if((Mode & UnitDiag)==0)
{
Scalar b = conj(rhs(j,j));
Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
for (Index i=0; i<actual_mc; ++i)
r[i] /= b;
r[i] *= inv_rjj;
}
}

Some files were not shown because too many files have changed in this diff Show More