mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Compare commits
145 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5e8edd2186 | ||
|
|
0ac1fc52dd | ||
|
|
6aa0143851 | ||
|
|
c7f6f8315f | ||
|
|
b0448fc6e0 | ||
|
|
3b8644da50 | ||
|
|
414c42bfcf | ||
|
|
952eda443b | ||
|
|
6a4a0b66bd | ||
|
|
079de53fa5 | ||
|
|
ce950ca2db | ||
|
|
49bd503308 | ||
|
|
5b20d9f326 | ||
|
|
5f8f69020b | ||
|
|
dc9325848a | ||
|
|
9df4c76bb8 | ||
|
|
0071c2e8a8 | ||
|
|
03727bdf55 | ||
|
|
5e39ba6642 | ||
|
|
d2ce4faa5a | ||
|
|
43b7aa2412 | ||
|
|
23b1682723 | ||
|
|
c53002f5fb | ||
|
|
ea37d9e73e | ||
|
|
ece7cec604 | ||
|
|
2e708d48ca | ||
|
|
109935bfce | ||
|
|
339d7188ed | ||
|
|
02f420012a | ||
|
|
d45ac54008 | ||
|
|
d9585478d9 | ||
|
|
01421e31a2 | ||
|
|
2f81b6363f | ||
|
|
53a7864c48 | ||
|
|
9fc3d9f3ca | ||
|
|
84911f9c05 | ||
|
|
77dc6dbb44 | ||
|
|
a36d19c4fc | ||
|
|
0fd6b4f71d | ||
|
|
52207cf6f9 | ||
|
|
0c26611d2d | ||
|
|
2a4fcb2c31 | ||
|
|
54930b6b55 | ||
|
|
4e5385c905 | ||
|
|
ac632f663e | ||
|
|
3620371c5c | ||
|
|
5dda502f84 | ||
|
|
590aec8fab | ||
|
|
75f8b06e50 | ||
|
|
e91e5d8c87 | ||
|
|
ef3cc72cb6 | ||
|
|
7a0a2a5001 | ||
|
|
bfdd4a9903 | ||
|
|
dc252fbf00 | ||
|
|
9f202c6f1e | ||
|
|
b933946d63 | ||
|
|
1b080fdcb9 | ||
|
|
a796be81a4 | ||
|
|
76f0650563 | ||
|
|
4707c3aa86 | ||
|
|
89a86ed422 | ||
|
|
f55a6d051b | ||
|
|
b343baceb4 | ||
|
|
abb5d04c3a | ||
|
|
94f66fc32e | ||
|
|
efd72cddcd | ||
|
|
def70b2e37 | ||
|
|
e5b35d64f7 | ||
|
|
02ef38020b | ||
|
|
16ed93cf61 | ||
|
|
86306a5cab | ||
|
|
1595ee4067 | ||
|
|
e22d0947c7 | ||
|
|
0dd9643ad5 | ||
|
|
14db78c53b | ||
|
|
84364ad11d | ||
|
|
160c0a3404 | ||
|
|
89449a0821 | ||
|
|
e1e35a2246 | ||
|
|
5f1082d0b1 | ||
|
|
1039348f12 | ||
|
|
a8d516b04e | ||
|
|
f1b1f13d3c | ||
|
|
a153dbae9b | ||
|
|
3d7e2a1f3d | ||
|
|
e1e9b3f9f6 | ||
|
|
3d18879fc4 | ||
|
|
cd12bf6317 | ||
|
|
1760432f6e | ||
|
|
a7d6a65d49 | ||
|
|
74d474e7df | ||
|
|
47e2f8a42c | ||
|
|
3cf273591a | ||
|
|
8ee2e10af4 | ||
|
|
165db26dc0 | ||
|
|
52d159c19f | ||
|
|
6abc9e5379 | ||
|
|
f722e43770 | ||
|
|
f483c7ea8a | ||
|
|
32cb4853c6 | ||
|
|
5c4b03c53e | ||
|
|
3a79cc3f5b | ||
|
|
71d0402e3e | ||
|
|
427f2f66d6 | ||
|
|
6ed74ac97c | ||
|
|
841d844f9c | ||
|
|
4387298e80 | ||
|
|
4f77286c3d | ||
|
|
4a242ac43d | ||
|
|
fe8cd812b0 | ||
|
|
b7249a2a42 | ||
|
|
7c42084503 | ||
|
|
8fb28db12d | ||
|
|
e777674a87 | ||
|
|
222ce4b49d | ||
|
|
4415d4e2d4 | ||
|
|
07c2081aac | ||
|
|
dd93c41618 | ||
|
|
441b3511de | ||
|
|
cf0bf73edc | ||
|
|
f719b23ef7 | ||
|
|
5110d803e4 | ||
|
|
6b59aa705a | ||
|
|
7b93328baf | ||
|
|
6adb70d3c4 | ||
|
|
c354f59fb6 | ||
|
|
637302a4c2 | ||
|
|
2aa9eb3ce8 | ||
|
|
f1c12d8ff0 | ||
|
|
6c4d57dc9e | ||
|
|
6870a39feb | ||
|
|
bb9981e24b | ||
|
|
74a0c08d70 | ||
|
|
18dc2107ea | ||
|
|
c28ba89fe2 | ||
|
|
c781bf2202 | ||
|
|
c30beb5974 | ||
|
|
7b160dcc82 | ||
|
|
c7266da750 | ||
|
|
eea99eb4e0 | ||
|
|
65a6d41510 | ||
|
|
210d510a90 | ||
|
|
549c32cb42 | ||
|
|
25a1160849 | ||
|
|
7670ff9272 |
19
.clang-format
Normal file
19
.clang-format
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
BasedOnStyle: Google
|
||||
ColumnLimit: 120
|
||||
---
|
||||
Language: Cpp
|
||||
BasedOnStyle: Google
|
||||
ColumnLimit: 120
|
||||
StatementMacros:
|
||||
- EIGEN_STATIC_ASSERT
|
||||
- EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
|
||||
- EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
|
||||
SortIncludes: false
|
||||
AttributeMacros:
|
||||
- EIGEN_STRONG_INLINE
|
||||
- EIGEN_ALWAYS_INLINE
|
||||
- EIGEN_DEVICE_FUNC
|
||||
- EIGEN_DONT_INLINE
|
||||
- EIGEN_DEPRECATED
|
||||
- EIGEN_UNUSED
|
||||
5
.hgignore → .gitignore
vendored
5
.hgignore → .gitignore
vendored
@@ -1,4 +1,3 @@
|
||||
syntax: glob
|
||||
qrc_*cxx
|
||||
*.orig
|
||||
*.pyc
|
||||
@@ -28,7 +27,11 @@ activity.png
|
||||
*.rej
|
||||
log
|
||||
patch
|
||||
*.patch
|
||||
a
|
||||
a.*
|
||||
lapack/testing
|
||||
lapack/reference
|
||||
.*project
|
||||
.settings
|
||||
Makefile
|
||||
34
.gitlab-ci.yml
Normal file
34
.gitlab-ci.yml
Normal file
@@ -0,0 +1,34 @@
|
||||
# This file is part of Eigen, a lightweight C++ template library
|
||||
# for linear algebra.
|
||||
#
|
||||
# Copyright (C) 2023, The Eigen Authors
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla
|
||||
# Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
stages:
|
||||
- checkformat
|
||||
- build
|
||||
- test
|
||||
- deploy
|
||||
|
||||
variables:
|
||||
# CMake build directory.
|
||||
EIGEN_CI_BUILDDIR: .build
|
||||
# Specify the CMake build target.
|
||||
EIGEN_CI_BUILD_TARGET: ""
|
||||
# If a test regex is specified, that will be selected.
|
||||
# Otherwise, we will try a label if specified.
|
||||
EIGEN_CI_CTEST_REGEX: ""
|
||||
EIGEN_CI_CTEST_LABEL: ""
|
||||
EIGEN_CI_CTEST_ARGS: ""
|
||||
|
||||
include:
|
||||
- "/ci/checkformat.gitlab-ci.yml"
|
||||
- "/ci/common.gitlab-ci.yml"
|
||||
- "/ci/build.linux.gitlab-ci.yml"
|
||||
- "/ci/build.windows.gitlab-ci.yml"
|
||||
- "/ci/test.linux.gitlab-ci.yml"
|
||||
- "/ci/test.windows.gitlab-ci.yml"
|
||||
- "/ci/deploy.gitlab-ci.yml"
|
||||
@@ -1,7 +1,7 @@
|
||||
project(Eigen3)
|
||||
|
||||
cmake_minimum_required(VERSION 2.8.5)
|
||||
|
||||
project(Eigen3)
|
||||
|
||||
# guard against in-source builds
|
||||
|
||||
if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
|
||||
@@ -19,14 +19,6 @@ if (NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
endif()
|
||||
|
||||
string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower)
|
||||
if( NOT cmake_build_type_tolower STREQUAL "debug"
|
||||
AND NOT cmake_build_type_tolower STREQUAL "release"
|
||||
AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo")
|
||||
message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).")
|
||||
endif()
|
||||
|
||||
|
||||
#############################################################################
|
||||
# retrieve version information #
|
||||
#############################################################################
|
||||
@@ -94,6 +86,20 @@ else()
|
||||
ei_add_cxx_compiler_flag("-std=c++03")
|
||||
endif()
|
||||
|
||||
function(ei_maybe_separate_arguments variable mode args)
|
||||
# Use separate_arguments if the input is a single string containing a space.
|
||||
# Otherwise, if it is already a list or doesn't have a space, just propagate
|
||||
# the original value. This is to better support multi-argument lists.
|
||||
list(LENGTH args list_length)
|
||||
if (${list_length} EQUAL 1)
|
||||
string(FIND "${args}" " " has_space)
|
||||
if (${has_space} GREATER -1)
|
||||
separate_arguments(args ${mode} "${args}")
|
||||
endif()
|
||||
endif()
|
||||
set(${variable} ${args} PARENT_SCOPE)
|
||||
endfunction(ei_maybe_separate_arguments)
|
||||
|
||||
#############################################################################
|
||||
# find how to link to the standard libraries #
|
||||
#############################################################################
|
||||
@@ -104,6 +110,10 @@ find_package(StandardMathLibrary)
|
||||
set(EIGEN_TEST_CUSTOM_LINKER_FLAGS "" CACHE STRING "Additional linker flags when linking unit tests.")
|
||||
set(EIGEN_TEST_CUSTOM_CXX_FLAGS "" CACHE STRING "Additional compiler flags when compiling unit tests.")
|
||||
|
||||
# Convert space-separated arguments into CMake lists for downstream consumption.
|
||||
ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_LINKER_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_LINKER_FLAGS}")
|
||||
ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_CXX_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
|
||||
|
||||
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")
|
||||
|
||||
if(NOT STANDARD_MATH_LIBRARY_FOUND)
|
||||
@@ -158,7 +168,7 @@ if(NOT MSVC)
|
||||
ei_add_cxx_compiler_flag("-Wall")
|
||||
ei_add_cxx_compiler_flag("-Wextra")
|
||||
#ei_add_cxx_compiler_flag("-Weverything") # clang
|
||||
|
||||
|
||||
ei_add_cxx_compiler_flag("-Wundef")
|
||||
ei_add_cxx_compiler_flag("-Wcast-align")
|
||||
ei_add_cxx_compiler_flag("-Wchar-subscripts")
|
||||
@@ -173,29 +183,29 @@ if(NOT MSVC)
|
||||
ei_add_cxx_compiler_flag("-Wc++11-extensions")
|
||||
ei_add_cxx_compiler_flag("-Wdouble-promotion")
|
||||
# ei_add_cxx_compiler_flag("-Wconversion")
|
||||
|
||||
|
||||
# -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
|
||||
# if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
|
||||
if(NOT CMAKE_COMPILER_IS_GNUCXX)
|
||||
ei_add_cxx_compiler_flag("-Wshadow")
|
||||
endif()
|
||||
|
||||
|
||||
ei_add_cxx_compiler_flag("-Wno-psabi")
|
||||
ei_add_cxx_compiler_flag("-Wno-variadic-macros")
|
||||
ei_add_cxx_compiler_flag("-Wno-long-long")
|
||||
|
||||
|
||||
ei_add_cxx_compiler_flag("-fno-check-new")
|
||||
ei_add_cxx_compiler_flag("-fno-common")
|
||||
ei_add_cxx_compiler_flag("-fstrict-aliasing")
|
||||
ei_add_cxx_compiler_flag("-wd981") # disable ICC's "operands are evaluated in unspecified order" remark
|
||||
ei_add_cxx_compiler_flag("-wd2304") # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
|
||||
|
||||
|
||||
|
||||
|
||||
# The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag, causing it to fail
|
||||
# Moreover we should not set both -strict-ansi and -ansi
|
||||
check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
|
||||
ei_add_cxx_compiler_flag("-Qunused-arguments") # disable clang warning: argument unused during compilation: '-ansi'
|
||||
|
||||
|
||||
if(COMPILER_SUPPORT_STRICTANSI)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi")
|
||||
else()
|
||||
@@ -206,7 +216,7 @@ if(NOT MSVC)
|
||||
ei_add_cxx_compiler_flag("-pie")
|
||||
ei_add_cxx_compiler_flag("-fPIE")
|
||||
endif()
|
||||
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS "")
|
||||
|
||||
option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF)
|
||||
@@ -380,6 +390,7 @@ if(EIGEN_TEST_NO_EXCEPTIONS)
|
||||
message(STATUS "Disabling exceptions in tests/examples")
|
||||
endif()
|
||||
|
||||
set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
|
||||
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
@@ -391,22 +402,27 @@ endif()
|
||||
|
||||
if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
|
||||
set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
|
||||
CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
|
||||
CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
|
||||
else()
|
||||
set(INCLUDE_INSTALL_DIR
|
||||
"${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
|
||||
CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
|
||||
CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
|
||||
)
|
||||
endif()
|
||||
set(CMAKEPACKAGE_INSTALL_DIR
|
||||
"${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
|
||||
CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
|
||||
CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
|
||||
)
|
||||
set(PKGCONFIG_INSTALL_DIR
|
||||
"${CMAKE_INSTALL_DATADIR}/pkgconfig"
|
||||
CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
|
||||
CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
|
||||
)
|
||||
|
||||
foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR)
|
||||
if(IS_ABSOLUTE "${${var}}")
|
||||
message(FATAL_ERROR "${var} must be relative to CMAKE_PREFIX_PATH. Got: ${${var}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# similar to set_target_properties but append the property instead of overwriting it
|
||||
macro(ei_add_target_property target prop value)
|
||||
@@ -446,13 +462,15 @@ if(BUILD_TESTING)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
|
||||
add_subdirectory(blas)
|
||||
add_subdirectory(lapack)
|
||||
else()
|
||||
add_subdirectory(blas EXCLUDE_FROM_ALL)
|
||||
add_subdirectory(lapack EXCLUDE_FROM_ALL)
|
||||
endif()
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
|
||||
add_subdirectory(blas)
|
||||
add_subdirectory(lapack)
|
||||
else()
|
||||
add_subdirectory(blas EXCLUDE_FROM_ALL)
|
||||
add_subdirectory(lapack EXCLUDE_FROM_ALL)
|
||||
endif()
|
||||
endif(NOT CMAKE_CROSSCOMPILING)
|
||||
|
||||
# add SYCL
|
||||
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
|
||||
@@ -535,6 +553,7 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
|
||||
# Imported target support
|
||||
add_library (eigen INTERFACE)
|
||||
add_library (Eigen3::Eigen ALIAS eigen)
|
||||
|
||||
target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
|
||||
target_include_directories (eigen INTERFACE
|
||||
@@ -576,11 +595,11 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
|
||||
else (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
# Fallback to legacy Eigen3Config.cmake without the imported target
|
||||
|
||||
|
||||
# If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
|
||||
# create a relocatable Config file, otherwise leave the hardcoded paths
|
||||
# create a relocatable Config file, otherwise leave the hardcoded paths
|
||||
include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
|
||||
|
||||
|
||||
if(CPCH_PATH)
|
||||
configure_package_config_file (
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
|
||||
@@ -589,7 +608,7 @@ else (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
|
||||
NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
|
||||
)
|
||||
else()
|
||||
else()
|
||||
# The PACKAGE_* variables are defined by the configure_package_config_file
|
||||
# but without it we define them manually to the hardcoded paths
|
||||
set(PACKAGE_INIT "")
|
||||
|
||||
@@ -4,10 +4,10 @@
|
||||
## # The following are required to use Dart and the CDash dashboard
|
||||
## ENABLE_TESTING()
|
||||
## INCLUDE(CTest)
|
||||
set(CTEST_PROJECT_NAME "Eigen 3.3")
|
||||
set(CTEST_PROJECT_NAME "Eigen")
|
||||
set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")
|
||||
|
||||
set(CTEST_DROP_METHOD "http")
|
||||
set(CTEST_DROP_SITE "manao.inria.fr")
|
||||
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen+3.3")
|
||||
set(CTEST_DROP_SITE "my.cdash.org")
|
||||
set(CTEST_DROP_LOCATION "/submit.php?project=Eigen")
|
||||
set(CTEST_DROP_SITE_CDASH TRUE)
|
||||
|
||||
34
Eigen/Core
34
Eigen/Core
@@ -31,7 +31,7 @@
|
||||
#endif
|
||||
|
||||
// Handle NVCC/CUDA/SYCL
|
||||
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
|
||||
#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
|
||||
// Do not try asserts on CUDA and SYCL!
|
||||
#ifndef EIGEN_NO_DEBUG
|
||||
#define EIGEN_NO_DEBUG
|
||||
@@ -46,7 +46,7 @@
|
||||
#endif
|
||||
|
||||
// All functions callable from CUDA code must be qualified with __device__
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
// Do not try to vectorize on CUDA and SYCL!
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
@@ -62,9 +62,16 @@
|
||||
|
||||
#else
|
||||
#define EIGEN_DEVICE_FUNC
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_CUDACC)
|
||||
#include <cuda.h>
|
||||
#define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
|
||||
#else
|
||||
#define EIGEN_CUDA_SDK_VER 0
|
||||
#endif
|
||||
|
||||
|
||||
// When compiling CUDA device code with NVCC, pull in math functions from the
|
||||
// global namespace. In host mode, and when device code with clang, use the
|
||||
// std versions.
|
||||
@@ -123,7 +130,7 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#if !defined(EIGEN_DONT_VECTORIZE) && !defined(EIGEN_CUDACC)
|
||||
|
||||
#if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
|
||||
|
||||
@@ -213,6 +220,7 @@
|
||||
} // end extern "C"
|
||||
#elif defined __VSX__
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#define EIGEN_VECTORIZE_VSX
|
||||
#include <altivec.h>
|
||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||
@@ -222,6 +230,7 @@
|
||||
#undef pixel
|
||||
#elif defined __ALTIVEC__
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#define EIGEN_VECTORIZE_ALTIVEC
|
||||
#include <altivec.h>
|
||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||
@@ -233,6 +242,10 @@
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_NEON
|
||||
#include <arm_neon.h>
|
||||
// Enable FMA for ARM.
|
||||
#if defined(__ARM_FEATURE_FMA)
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#endif
|
||||
#elif (defined __s390x__ && defined __VEC__)
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_ZVECTOR
|
||||
@@ -245,16 +258,16 @@
|
||||
#define EIGEN_HAS_FP16_C
|
||||
#endif
|
||||
|
||||
#if defined __CUDACC__
|
||||
#if defined EIGEN_CUDACC
|
||||
#define EIGEN_VECTORIZE_CUDA
|
||||
#include <vector_types.h>
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#if EIGEN_CUDA_SDK_VER >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_HAS_CUDA_FP16
|
||||
#include <host_defines.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
@@ -279,7 +292,10 @@
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <sstream>
|
||||
#ifndef EIGEN_NO_IO
|
||||
#include <iosfwd>
|
||||
#endif
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <limits>
|
||||
@@ -375,7 +391,9 @@ using std::ptrdiff_t;
|
||||
|
||||
#if defined EIGEN_VECTORIZE_AVX512
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
#include "src/Core/arch/SSE/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX/PacketMath.h"
|
||||
#include "src/Core/arch/AVX/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX512/PacketMath.h"
|
||||
#include "src/Core/arch/AVX512/MathFunctions.h"
|
||||
#elif defined EIGEN_VECTORIZE_AVX
|
||||
|
||||
@@ -10,14 +10,14 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "Cholesky"
|
||||
#include "Jacobi"
|
||||
#include "Householder"
|
||||
#include "LU"
|
||||
#include "Geometry"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup Eigenvalues_Module Eigenvalues module
|
||||
*
|
||||
*
|
||||
|
||||
@@ -10,12 +10,12 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "SVD"
|
||||
#include "LU"
|
||||
#include <limits>
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup Geometry_Module Geometry module
|
||||
*
|
||||
* This module provides support for:
|
||||
|
||||
4
Eigen/QR
4
Eigen/QR
@@ -10,12 +10,12 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "Cholesky"
|
||||
#include "Jacobi"
|
||||
#include "Householder"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup QR_Module QR module
|
||||
*
|
||||
*
|
||||
|
||||
@@ -28,7 +28,6 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "OrderingMethods"
|
||||
#include "src/SparseCore/SparseColEtree.h"
|
||||
#include "src/SparseQR/SparseQR.h"
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ namespace internal {
|
||||
* decomposition to determine whether a system of equations has a solution.
|
||||
*
|
||||
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
|
||||
*
|
||||
*
|
||||
* \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
|
||||
*/
|
||||
template<typename _MatrixType, int _UpLo> class LDLT
|
||||
@@ -558,7 +558,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename _MatrixType, int _UpLo>
|
||||
template<typename RhsType, typename DstType>
|
||||
void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
EIGEN_DEVICE_FUNC void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
{
|
||||
eigen_assert(rhs.rows() == rows());
|
||||
// dst = P b
|
||||
|
||||
@@ -475,7 +475,7 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename _MatrixType,int _UpLo>
|
||||
template<typename RhsType, typename DstType>
|
||||
void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
EIGEN_DEVICE_FUNC void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
{
|
||||
dst = rhs;
|
||||
solveInPlace(dst);
|
||||
|
||||
@@ -153,8 +153,8 @@ template<typename Derived> class ArrayBase
|
||||
// inline void evalTo(Dest& dst) const { dst = matrix(); }
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC
|
||||
ArrayBase() : Base() {}
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
|
||||
|
||||
private:
|
||||
explicit ArrayBase(Index);
|
||||
|
||||
@@ -16,7 +16,7 @@ namespace Eigen {
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
|
||||
::lazyAssign(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
enum{
|
||||
@@ -29,7 +29,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
|
||||
|
||||
eigen_assert(rows() == other.rows() && cols() == other.cols());
|
||||
internal::call_assignment_no_alias(derived(),other.derived());
|
||||
|
||||
|
||||
return derived();
|
||||
}
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ namespace Eigen {
|
||||
// This implementation is based on Assign.h
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
/***************************************************************************
|
||||
* Part 1 : the logic deciding a strategy for traversal and unrolling *
|
||||
***************************************************************************/
|
||||
@@ -29,12 +29,12 @@ struct copy_using_evaluator_traits
|
||||
{
|
||||
typedef typename DstEvaluator::XprType Dst;
|
||||
typedef typename Dst::Scalar DstScalar;
|
||||
|
||||
|
||||
enum {
|
||||
DstFlags = DstEvaluator::Flags,
|
||||
SrcFlags = SrcEvaluator::Flags
|
||||
};
|
||||
|
||||
|
||||
public:
|
||||
enum {
|
||||
DstAlignment = DstEvaluator::Alignment,
|
||||
@@ -135,7 +135,7 @@ public:
|
||||
? int(CompleteUnrolling)
|
||||
: int(NoUnrolling) )
|
||||
: int(Traversal) == int(LinearTraversal)
|
||||
? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
|
||||
? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
|
||||
: int(NoUnrolling) )
|
||||
#if EIGEN_UNALIGNED_VECTORIZE
|
||||
: int(Traversal) == int(SliceVectorizedTraversal)
|
||||
@@ -195,7 +195,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
|
||||
// FIXME: this is not very clean, perhaps this information should be provided by the kernel?
|
||||
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
|
||||
typedef typename DstEvaluatorType::XprType DstXprType;
|
||||
|
||||
|
||||
enum {
|
||||
outer = Index / DstXprType::InnerSizeAtCompileTime,
|
||||
inner = Index % DstXprType::InnerSizeAtCompileTime
|
||||
@@ -261,7 +261,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
|
||||
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
|
||||
typedef typename DstEvaluatorType::XprType DstXprType;
|
||||
typedef typename Kernel::PacketType PacketType;
|
||||
|
||||
|
||||
enum {
|
||||
outer = Index / DstXprType::InnerSizeAtCompileTime,
|
||||
inner = Index % DstXprType::InnerSizeAtCompileTime,
|
||||
@@ -426,7 +426,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
|
||||
{
|
||||
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
|
||||
typedef typename Kernel::PacketType PacketType;
|
||||
|
||||
|
||||
enum { size = DstXprType::SizeAtCompileTime,
|
||||
packetSize =unpacket_traits<PacketType>::size,
|
||||
alignedSize = (size/packetSize)*packetSize };
|
||||
@@ -599,14 +599,14 @@ protected:
|
||||
typedef typename DstEvaluatorTypeT::XprType DstXprType;
|
||||
typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
|
||||
public:
|
||||
|
||||
|
||||
typedef DstEvaluatorTypeT DstEvaluatorType;
|
||||
typedef SrcEvaluatorTypeT SrcEvaluatorType;
|
||||
typedef typename DstEvaluatorType::Scalar Scalar;
|
||||
typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
|
||||
typedef typename AssignmentTraits::PacketType PacketType;
|
||||
|
||||
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
|
||||
: m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
|
||||
{
|
||||
@@ -614,58 +614,58 @@ public:
|
||||
AssignmentTraits::debug();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC Index size() const { return m_dstExpr.size(); }
|
||||
EIGEN_DEVICE_FUNC Index innerSize() const { return m_dstExpr.innerSize(); }
|
||||
EIGEN_DEVICE_FUNC Index outerSize() const { return m_dstExpr.outerSize(); }
|
||||
EIGEN_DEVICE_FUNC Index rows() const { return m_dstExpr.rows(); }
|
||||
EIGEN_DEVICE_FUNC Index cols() const { return m_dstExpr.cols(); }
|
||||
EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
|
||||
EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
|
||||
|
||||
|
||||
/// Assign src(row,col) to dst(row,col) through the assignment functor.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
|
||||
{
|
||||
m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
|
||||
}
|
||||
|
||||
|
||||
/// \sa assignCoeff(Index,Index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
|
||||
{
|
||||
m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
|
||||
}
|
||||
|
||||
|
||||
/// \sa assignCoeff(Index,Index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
|
||||
{
|
||||
Index row = rowIndexByOuterInner(outer, inner);
|
||||
Index col = colIndexByOuterInner(outer, inner);
|
||||
Index row = rowIndexByOuterInner(outer, inner);
|
||||
Index col = colIndexByOuterInner(outer, inner);
|
||||
assignCoeff(row, col);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
template<int StoreMode, int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
|
||||
{
|
||||
m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
|
||||
}
|
||||
|
||||
|
||||
template<int StoreMode, int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
|
||||
{
|
||||
m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
|
||||
}
|
||||
|
||||
|
||||
template<int StoreMode, int LoadMode, typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
|
||||
{
|
||||
Index row = rowIndexByOuterInner(outer, inner);
|
||||
Index row = rowIndexByOuterInner(outer, inner);
|
||||
Index col = colIndexByOuterInner(outer, inner);
|
||||
assignPacket<StoreMode,LoadMode,PacketType>(row, col);
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
|
||||
{
|
||||
typedef typename DstEvaluatorType::ExpressionTraits Traits;
|
||||
@@ -688,7 +688,7 @@ public:
|
||||
{
|
||||
return m_dstExpr.data();
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
DstEvaluatorType& m_dst;
|
||||
const SrcEvaluatorType& m_src;
|
||||
@@ -734,7 +734,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
|
||||
resize_if_allowed(dst, src, func);
|
||||
|
||||
DstEvaluatorType dstEvaluator(dst);
|
||||
|
||||
|
||||
typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
|
||||
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
|
||||
|
||||
@@ -762,7 +762,7 @@ struct EigenBase2EigenBase {};
|
||||
|
||||
template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
|
||||
template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
|
||||
|
||||
|
||||
// This is the main assignment class
|
||||
template< typename DstXprType, typename SrcXprType, typename Functor,
|
||||
typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
|
||||
@@ -787,7 +787,7 @@ void call_assignment(const Dst& dst, const Src& src)
|
||||
{
|
||||
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
|
||||
}
|
||||
|
||||
|
||||
// Deal with "assume-aliasing"
|
||||
template<typename Dst, typename Src, typename Func>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@@ -827,12 +827,12 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
|
||||
typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
|
||||
typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
|
||||
ActualDstType actualDst(dst);
|
||||
|
||||
|
||||
// TODO check whether this is the right place to perform these checks:
|
||||
EIGEN_STATIC_ASSERT_LVALUE(Dst)
|
||||
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
|
||||
EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
|
||||
|
||||
|
||||
Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
|
||||
}
|
||||
template<typename Dst, typename Src>
|
||||
@@ -869,13 +869,12 @@ template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, con
|
||||
template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
|
||||
struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
|
||||
{
|
||||
#ifndef EIGEN_NO_DEBUG
|
||||
internal::check_for_aliasing(dst, src);
|
||||
#endif
|
||||
|
||||
|
||||
call_dense_assignment_loop(dst, src, func);
|
||||
}
|
||||
};
|
||||
@@ -887,8 +886,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
|
||||
template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
|
||||
struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
Index dstCols = src.cols();
|
||||
|
||||
@@ -23,7 +23,7 @@ struct all_unroller
|
||||
row = (UnrollCount-1) % Traits::RowsAtCompileTime
|
||||
};
|
||||
|
||||
static inline bool run(const Derived &mat)
|
||||
EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
|
||||
{
|
||||
return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
|
||||
}
|
||||
@@ -32,13 +32,13 @@ struct all_unroller
|
||||
template<typename Derived>
|
||||
struct all_unroller<Derived, 0>
|
||||
{
|
||||
static inline bool run(const Derived &/*mat*/) { return true; }
|
||||
EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
struct all_unroller<Derived, Dynamic>
|
||||
{
|
||||
static inline bool run(const Derived &) { return false; }
|
||||
EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
|
||||
};
|
||||
|
||||
template<typename Derived, int UnrollCount>
|
||||
@@ -50,7 +50,7 @@ struct any_unroller
|
||||
row = (UnrollCount-1) % Traits::RowsAtCompileTime
|
||||
};
|
||||
|
||||
static inline bool run(const Derived &mat)
|
||||
EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
|
||||
{
|
||||
return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
|
||||
}
|
||||
@@ -59,13 +59,13 @@ struct any_unroller
|
||||
template<typename Derived>
|
||||
struct any_unroller<Derived, 0>
|
||||
{
|
||||
static inline bool run(const Derived & /*mat*/) { return false; }
|
||||
EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
struct any_unroller<Derived, Dynamic>
|
||||
{
|
||||
static inline bool run(const Derived &) { return false; }
|
||||
EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
@@ -78,7 +78,7 @@ struct any_unroller<Derived, Dynamic>
|
||||
* \sa any(), Cwise::operator<()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline bool DenseBase<Derived>::all() const
|
||||
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
|
||||
{
|
||||
typedef internal::evaluator<Derived> Evaluator;
|
||||
enum {
|
||||
@@ -102,7 +102,7 @@ inline bool DenseBase<Derived>::all() const
|
||||
* \sa all()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline bool DenseBase<Derived>::any() const
|
||||
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
|
||||
{
|
||||
typedef internal::evaluator<Derived> Evaluator;
|
||||
enum {
|
||||
@@ -126,7 +126,7 @@ inline bool DenseBase<Derived>::any() const
|
||||
* \sa all(), any()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline Eigen::Index DenseBase<Derived>::count() const
|
||||
EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
|
||||
{
|
||||
return derived().template cast<bool>().template cast<Index>().sum();
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_COMMAINITIALIZER_H
|
||||
#define EIGEN_COMMAINITIALIZER_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
/** \class CommaInitializer
|
||||
* \ingroup Core_Module
|
||||
@@ -44,7 +44,7 @@ struct CommaInitializer
|
||||
m_xpr.block(0, 0, other.rows(), other.cols()) = other;
|
||||
}
|
||||
|
||||
/* Copy/Move constructor which transfers ownership. This is crucial in
|
||||
/* Copy/Move constructor which transfers ownership. This is crucial in
|
||||
* absence of return value optimization to avoid assertions during destruction. */
|
||||
// FIXME in C++11 mode this could be replaced by a proper RValue constructor
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -135,13 +135,13 @@ struct CommaInitializer
|
||||
*
|
||||
* Example: \include MatrixBase_set.cpp
|
||||
* Output: \verbinclude MatrixBase_set.out
|
||||
*
|
||||
*
|
||||
* \note According to the C++ standard, the argument expressions of this comma initializer are evaluated in arbitrary order.
|
||||
*
|
||||
* \sa CommaInitializer::finished(), class CommaInitializer
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
|
||||
EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
|
||||
{
|
||||
return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
|
||||
}
|
||||
@@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
|
||||
/** \sa operator<<(const Scalar&) */
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
inline CommaInitializer<Derived>
|
||||
EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
|
||||
DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
|
||||
|
||||
@@ -74,7 +74,7 @@ class CwiseBinaryOpImpl;
|
||||
* \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
|
||||
*/
|
||||
template<typename BinaryOp, typename LhsType, typename RhsType>
|
||||
class CwiseBinaryOp :
|
||||
class CwiseBinaryOp :
|
||||
public CwiseBinaryOpImpl<
|
||||
BinaryOp, LhsType, RhsType,
|
||||
typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
|
||||
@@ -83,7 +83,7 @@ class CwiseBinaryOp :
|
||||
internal::no_assignment_operator
|
||||
{
|
||||
public:
|
||||
|
||||
|
||||
typedef typename internal::remove_all<BinaryOp>::type Functor;
|
||||
typedef typename internal::remove_all<LhsType>::type Lhs;
|
||||
typedef typename internal::remove_all<RhsType>::type Rhs;
|
||||
@@ -158,7 +158,7 @@ public:
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
|
||||
@@ -126,12 +126,12 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
|
||||
*
|
||||
* Here is an example with C++11 random generators: \include random_cpp11.cpp
|
||||
* Output: \verbinclude random_cpp11.out
|
||||
*
|
||||
*
|
||||
* \sa class CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename CustomNullaryOp>
|
||||
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
@@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
|
||||
* \sa class CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
|
||||
{
|
||||
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
|
||||
@@ -272,7 +272,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
|
||||
}
|
||||
|
||||
/**
|
||||
* \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
|
||||
* \copydoc DenseBase::LinSpaced(Index, const DenseBase::Scalar&, const DenseBase::Scalar&)
|
||||
* Special version for fixed size types which does not require the size parameter.
|
||||
*/
|
||||
template<typename Derived>
|
||||
|
||||
@@ -121,6 +121,8 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
|
||||
{
|
||||
return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
|
||||
}
|
||||
protected:
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
@@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() {
|
||||
*/
|
||||
template<typename Derived> class DenseBase
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
: public DenseCoeffsBase<Derived>
|
||||
: public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
|
||||
#else
|
||||
: public DenseCoeffsBase<Derived,DirectWriteAccessors>
|
||||
#endif // not EIGEN_PARSED_BY_DOXYGEN
|
||||
@@ -71,7 +71,7 @@ template<typename Derived> class DenseBase
|
||||
typedef Scalar value_type;
|
||||
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
typedef DenseCoeffsBase<Derived> Base;
|
||||
typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
|
||||
|
||||
using Base::derived;
|
||||
using Base::const_cast_derived;
|
||||
@@ -587,11 +587,12 @@ template<typename Derived> class DenseBase
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
|
||||
/** Default constructor. Do nothing. */
|
||||
EIGEN_DEVICE_FUNC DenseBase()
|
||||
{
|
||||
/* Just checks for self-consistency of the flags.
|
||||
* Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
|
||||
* Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
|
||||
*/
|
||||
#ifdef EIGEN_INTERNAL_DEBUGGING
|
||||
EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))
|
||||
|
||||
@@ -404,7 +404,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
|
||||
if(size != m_rows*m_cols)
|
||||
{
|
||||
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
|
||||
if (size)
|
||||
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
|
||||
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
|
||||
else
|
||||
m_data = 0;
|
||||
@@ -479,7 +479,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
|
||||
if(size != _Rows*m_cols)
|
||||
{
|
||||
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
|
||||
if (size)
|
||||
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
|
||||
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
|
||||
else
|
||||
m_data = 0;
|
||||
@@ -553,7 +553,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
|
||||
if(size != m_rows*_Cols)
|
||||
{
|
||||
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
|
||||
if (size)
|
||||
if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
|
||||
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
|
||||
else
|
||||
m_data = 0;
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_DIAGONAL_H
|
||||
#define EIGEN_DIAGONAL_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
/** \class Diagonal
|
||||
* \ingroup Core_Module
|
||||
@@ -149,8 +149,8 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const typename internal::remove_all<typename MatrixType::Nested>::type&
|
||||
nestedExpression() const
|
||||
inline const typename internal::remove_all<typename MatrixType::Nested>::type&
|
||||
nestedExpression() const
|
||||
{
|
||||
return m_matrix;
|
||||
}
|
||||
@@ -187,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
|
||||
*
|
||||
* \sa class Diagonal */
|
||||
template<typename Derived>
|
||||
inline typename MatrixBase<Derived>::DiagonalReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
|
||||
MatrixBase<Derived>::diagonal()
|
||||
{
|
||||
return DiagonalReturnType(derived());
|
||||
@@ -195,7 +195,7 @@ MatrixBase<Derived>::diagonal()
|
||||
|
||||
/** This is the const version of diagonal(). */
|
||||
template<typename Derived>
|
||||
inline typename MatrixBase<Derived>::ConstDiagonalReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
|
||||
MatrixBase<Derived>::diagonal() const
|
||||
{
|
||||
return ConstDiagonalReturnType(derived());
|
||||
@@ -213,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
|
||||
*
|
||||
* \sa MatrixBase::diagonal(), class Diagonal */
|
||||
template<typename Derived>
|
||||
inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
|
||||
MatrixBase<Derived>::diagonal(Index index)
|
||||
{
|
||||
return DiagonalDynamicIndexReturnType(derived(), index);
|
||||
@@ -221,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)
|
||||
|
||||
/** This is the const version of diagonal(Index). */
|
||||
template<typename Derived>
|
||||
inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
|
||||
MatrixBase<Derived>::diagonal(Index index) const
|
||||
{
|
||||
return ConstDiagonalDynamicIndexReturnType(derived(), index);
|
||||
@@ -240,7 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
|
||||
* \sa MatrixBase::diagonal(), class Diagonal */
|
||||
template<typename Derived>
|
||||
template<int Index_>
|
||||
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
|
||||
MatrixBase<Derived>::diagonal()
|
||||
{
|
||||
return typename DiagonalIndexReturnType<Index_>::Type(derived());
|
||||
@@ -249,7 +249,7 @@ MatrixBase<Derived>::diagonal()
|
||||
/** This is the const version of diagonal<int>(). */
|
||||
template<typename Derived>
|
||||
template<int Index_>
|
||||
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
|
||||
MatrixBase<Derived>::diagonal() const
|
||||
{
|
||||
return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_DIAGONALMATRIX_H
|
||||
#define EIGEN_DIAGONALMATRIX_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename Derived>
|
||||
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseMatrixType toDenseMatrix() const { return derived(); }
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -70,7 +70,7 @@ class DiagonalBase : public EigenBase<Derived>
|
||||
{
|
||||
return InverseReturnType(diagonal().cwiseInverse());
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
|
||||
operator*(const Scalar& scalar) const
|
||||
@@ -273,7 +273,7 @@ class DiagonalWrapper
|
||||
* \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
|
||||
**/
|
||||
template<typename Derived>
|
||||
inline const DiagonalWrapper<const Derived>
|
||||
EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
|
||||
MatrixBase<Derived>::asDiagonal() const
|
||||
{
|
||||
return DiagonalWrapper<const Derived>(derived());
|
||||
@@ -318,20 +318,20 @@ template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2De
|
||||
template< typename DstXprType, typename SrcXprType, typename Functor>
|
||||
struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
|
||||
{
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
Index dstCols = src.cols();
|
||||
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
|
||||
dst.resize(dstRows, dstCols);
|
||||
|
||||
|
||||
dst.setZero();
|
||||
dst.diagonal() = src.diagonal();
|
||||
}
|
||||
|
||||
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
|
||||
{ dst.diagonal() += src.diagonal(); }
|
||||
|
||||
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
|
||||
{ dst.diagonal() -= src.diagonal(); }
|
||||
};
|
||||
|
||||
@@ -17,7 +17,7 @@ namespace Eigen {
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename DiagonalDerived>
|
||||
inline const Product<Derived, DiagonalDerived, LazyProduct>
|
||||
EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
|
||||
MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
|
||||
{
|
||||
return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_DOT_H
|
||||
#define EIGEN_DOT_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -78,7 +78,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
|
||||
typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
|
||||
EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
|
||||
#endif
|
||||
|
||||
|
||||
eigen_assert(size() == other.size());
|
||||
|
||||
return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
|
||||
@@ -86,14 +86,14 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
|
||||
|
||||
//---------- implementation of L2 norm and related functions ----------
|
||||
|
||||
/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
|
||||
/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
|
||||
* In both cases, it is the sum of the squares of all the matrix entries.
|
||||
* For vectors, this is also equal to the dot product of \c *this with itself.
|
||||
*
|
||||
* \sa dot(), norm(), lpNorm()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
|
||||
{
|
||||
return numext::real((*this).cwiseAbs2().sum());
|
||||
}
|
||||
@@ -105,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
|
||||
* \sa lpNorm(), dot(), squaredNorm()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
|
||||
{
|
||||
return numext::sqrt(squaredNorm());
|
||||
}
|
||||
@@ -120,7 +120,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
|
||||
* \sa norm(), normalize()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
MatrixBase<Derived>::normalized() const
|
||||
{
|
||||
typedef typename internal::nested_eval<Derived,2>::type _Nested;
|
||||
@@ -142,7 +142,7 @@ MatrixBase<Derived>::normalized() const
|
||||
* \sa norm(), normalized()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
|
||||
{
|
||||
RealScalar z = squaredNorm();
|
||||
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
|
||||
@@ -163,7 +163,7 @@ EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
|
||||
* \sa stableNorm(), stableNormalize(), normalized()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
MatrixBase<Derived>::stableNormalized() const
|
||||
{
|
||||
typedef typename internal::nested_eval<Derived,3>::type _Nested;
|
||||
@@ -188,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
|
||||
* \sa stableNorm(), stableNormalized(), normalize()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
|
||||
{
|
||||
RealScalar w = cwiseAbs().maxCoeff();
|
||||
RealScalar z = (derived()/w).squaredNorm();
|
||||
@@ -260,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
|
||||
template<typename Derived>
|
||||
template<int p>
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
#else
|
||||
MatrixBase<Derived>::RealScalar
|
||||
EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
|
||||
#endif
|
||||
MatrixBase<Derived>::lpNorm() const
|
||||
{
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_FUZZY_H
|
||||
#define EIGEN_FUZZY_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal
|
||||
{
|
||||
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
bool DenseBase<Derived>::isApprox(
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
|
||||
const DenseBase<OtherDerived>& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
@@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
|
||||
* \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
|
||||
*/
|
||||
template<typename Derived>
|
||||
bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
const typename NumTraits<Scalar>::Real& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
@@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
const DenseBase<OtherDerived>& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
|
||||
@@ -207,12 +207,12 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
|
||||
typedef typename Rhs::Scalar RhsScalar;
|
||||
typedef typename Dest::Scalar ResScalar;
|
||||
typedef typename Dest::RealScalar RealScalar;
|
||||
|
||||
|
||||
typedef internal::blas_traits<Lhs> LhsBlasTraits;
|
||||
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
|
||||
typedef internal::blas_traits<Rhs> RhsBlasTraits;
|
||||
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
|
||||
|
||||
|
||||
typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
|
||||
|
||||
ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
|
||||
@@ -300,7 +300,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
|
||||
typedef typename Lhs::Scalar LhsScalar;
|
||||
typedef typename Rhs::Scalar RhsScalar;
|
||||
typedef typename Dest::Scalar ResScalar;
|
||||
|
||||
|
||||
typedef internal::blas_traits<Lhs> LhsBlasTraits;
|
||||
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
|
||||
typedef internal::blas_traits<Rhs> RhsBlasTraits;
|
||||
@@ -386,7 +386,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
inline const Product<Derived, OtherDerived>
|
||||
EIGEN_DEVICE_FUNC inline const Product<Derived, OtherDerived>
|
||||
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
{
|
||||
// A note regarding the function declaration: In MSVC, this function will sometimes
|
||||
@@ -428,7 +428,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
const Product<Derived,OtherDerived,LazyProduct>
|
||||
EIGEN_DEVICE_FUNC const Product<Derived,OtherDerived,LazyProduct>
|
||||
MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
|
||||
{
|
||||
enum {
|
||||
|
||||
@@ -237,7 +237,7 @@ ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
* For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
|
||||
* replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
|
||||
* Currently, this function is only used in matrix products.
|
||||
* For packet-size smaller or equal to 4, this function is equivalent to pload1
|
||||
* For packet-size smaller or equal to 4, this function is equivalent to pload1
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
ploadquad(const typename unpacket_traits<Packet>::type* from)
|
||||
@@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
|
||||
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(__LP64__)
|
||||
#if defined(__LP64__) || EIGEN_OS_WIN64
|
||||
// 64-bit pointer operand constraint for inlined asm
|
||||
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
||||
#else
|
||||
@@ -351,10 +351,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
|
||||
/** \internal \returns \a a with real and imaginary parts flipped (for complex types only) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
|
||||
{
|
||||
// FIXME: uncomment the following in case we drop the internal imag and real functions.
|
||||
// using std::imag;
|
||||
// using std::real;
|
||||
return Packet(imag(a),real(a));
|
||||
return Packet(a.imag(),a.real());
|
||||
}
|
||||
|
||||
/**************************
|
||||
@@ -362,77 +359,77 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet
|
||||
***************************/
|
||||
|
||||
/** \internal \returns the sine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet psin(const Packet& a) { using std::sin; return sin(a); }
|
||||
|
||||
/** \internal \returns the cosine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pcos(const Packet& a) { using std::cos; return cos(a); }
|
||||
|
||||
/** \internal \returns the tan of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet ptan(const Packet& a) { using std::tan; return tan(a); }
|
||||
|
||||
/** \internal \returns the arc sine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pasin(const Packet& a) { using std::asin; return asin(a); }
|
||||
|
||||
/** \internal \returns the arc cosine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pacos(const Packet& a) { using std::acos; return acos(a); }
|
||||
|
||||
/** \internal \returns the arc tangent of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet patan(const Packet& a) { using std::atan; return atan(a); }
|
||||
|
||||
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
|
||||
|
||||
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
|
||||
|
||||
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
|
||||
|
||||
/** \internal \returns the exp of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pexp(const Packet& a) { using std::exp; return exp(a); }
|
||||
|
||||
/** \internal \returns the log of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet plog(const Packet& a) { using std::log; return log(a); }
|
||||
|
||||
/** \internal \returns the log1p of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet plog1p(const Packet& a) { return numext::log1p(a); }
|
||||
|
||||
/** \internal \returns the log10 of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet plog10(const Packet& a) { using std::log10; return log10(a); }
|
||||
|
||||
/** \internal \returns the square-root of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
|
||||
|
||||
/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet prsqrt(const Packet& a) {
|
||||
return pdiv(pset1<Packet>(1), psqrt(a));
|
||||
}
|
||||
|
||||
/** \internal \returns the rounded value of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pround(const Packet& a) { using numext::round; return round(a); }
|
||||
|
||||
/** \internal \returns the floor of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
|
||||
|
||||
/** \internal \returns the ceil of \a a (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
|
||||
|
||||
/***************************************************************************
|
||||
@@ -497,14 +494,14 @@ struct palign_impl
|
||||
|
||||
/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
|
||||
* of \a first and \a Offset first elements of \a second.
|
||||
*
|
||||
*
|
||||
* This function is currently only used to optimize matrix-vector products on unaligned matrices.
|
||||
* It takes 2 packets that represent a contiguous memory array, and returns a packet starting
|
||||
* at the position \a Offset. For instance, for packets of 4 elements, we have:
|
||||
* Input:
|
||||
* - first = {f0,f1,f2,f3}
|
||||
* - second = {s0,s1,s2,s3}
|
||||
* Output:
|
||||
* Output:
|
||||
* - if Offset==0 then {f0,f1,f2,f3}
|
||||
* - if Offset==1 then {f1,f2,f3,s0}
|
||||
* - if Offset==2 then {f2,f3,s0,s1}
|
||||
@@ -521,13 +518,13 @@ inline void palign(PacketType& first, const PacketType& second)
|
||||
***************************************************************************/
|
||||
|
||||
// Eigen+CUDA does not support complexes.
|
||||
#ifndef __CUDACC__
|
||||
#ifndef EIGEN_CUDACC
|
||||
|
||||
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
||||
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||
{ return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
|
||||
|
||||
template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
|
||||
{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||
{ return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -182,6 +182,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
|
||||
#endif
|
||||
|
||||
protected:
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -294,6 +296,9 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
|
||||
// In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
|
||||
// see bugs 821 and 920.
|
||||
using ReadOnlyMapBase::Base::operator=;
|
||||
protected:
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
|
||||
};
|
||||
|
||||
#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
|
||||
|
||||
@@ -287,7 +287,7 @@ struct abs2_impl_default<Scalar, true> // IsComplex
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline RealScalar run(const Scalar& x)
|
||||
{
|
||||
return real(x)*real(x) + imag(x)*imag(x);
|
||||
return x.real()*x.real() + x.imag()*x.imag();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -313,14 +313,17 @@ struct abs2_retval
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar, bool IsComplex>
|
||||
struct norm1_default_impl
|
||||
struct norm1_default_impl;
|
||||
|
||||
template<typename Scalar>
|
||||
struct norm1_default_impl<Scalar,true>
|
||||
{
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline RealScalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
return abs(real(x)) + abs(imag(x));
|
||||
return abs(x.real()) + abs(x.imag());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -458,6 +461,65 @@ struct arg_retval
|
||||
typedef typename NumTraits<Scalar>::Real type;
|
||||
};
|
||||
|
||||
/****************************************************************************
|
||||
* Implementation of expm1 *
|
||||
****************************************************************************/
|
||||
|
||||
// This implementation is based on GSL Math's expm1.
|
||||
namespace std_fallback {
|
||||
// fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
|
||||
// or that there is no suitable std::expm1 function available. Implementation
|
||||
// attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
|
||||
EIGEN_USING_STD_MATH(exp);
|
||||
Scalar u = exp(x);
|
||||
if (numext::equal_strict(u, Scalar(1))) {
|
||||
return x;
|
||||
}
|
||||
Scalar um1 = u - RealScalar(1);
|
||||
if (numext::equal_strict(um1, Scalar(-1))) {
|
||||
return RealScalar(-1);
|
||||
}
|
||||
|
||||
EIGEN_USING_STD_MATH(log);
|
||||
return (u - RealScalar(1)) * x / log(u);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
struct expm1_impl {
|
||||
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
using std::expm1;
|
||||
#else
|
||||
using std_fallback::expm1;
|
||||
#endif
|
||||
return expm1(x);
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for complex types that are not supported by std::expm1.
|
||||
template <typename RealScalar>
|
||||
struct expm1_impl<std::complex<RealScalar> > {
|
||||
EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
|
||||
const std::complex<RealScalar>& x) {
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
|
||||
return std_fallback::expm1(x);
|
||||
}
|
||||
};
|
||||
|
||||
// Return-type trait for expm1: the result type is the argument type itself.
template<typename Scalar>
|
||||
struct expm1_retval
|
||||
{
|
||||
typedef Scalar type;
|
||||
};
|
||||
|
||||
/****************************************************************************
|
||||
* Implementation of log1p *
|
||||
****************************************************************************/
|
||||
@@ -477,17 +539,27 @@ namespace std_fallback {
|
||||
|
||||
template<typename Scalar>
|
||||
struct log1p_impl {
|
||||
static inline Scalar run(const Scalar& x)
|
||||
static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
using std::log1p;
|
||||
#endif
|
||||
#else
|
||||
using std_fallback::log1p;
|
||||
#endif
|
||||
return log1p(x);
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for complex types that are not supported by std::log1p.
|
||||
template <typename RealScalar>
|
||||
struct log1p_impl<std::complex<RealScalar> > {
|
||||
EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
|
||||
const std::complex<RealScalar>& x) {
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
|
||||
return std_fallback::log1p(x);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct log1p_retval
|
||||
@@ -552,19 +624,6 @@ struct random_retval
|
||||
template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y);
|
||||
template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();
|
||||
|
||||
template<typename Scalar>
|
||||
struct random_default_impl<Scalar, false, false>
|
||||
{
|
||||
static inline Scalar run(const Scalar& x, const Scalar& y)
|
||||
{
|
||||
return x + (y-x) * Scalar(std::rand()) / Scalar(RAND_MAX);
|
||||
}
|
||||
static inline Scalar run()
|
||||
{
|
||||
return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
|
||||
}
|
||||
};
|
||||
|
||||
enum {
|
||||
meta_floor_log2_terminate,
|
||||
meta_floor_log2_move_up,
|
||||
@@ -612,6 +671,38 @@ struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
|
||||
// no value, error at compile time
|
||||
};
|
||||
|
||||
#define EIGEN_RAND_MAX INT_MAX
|
||||
// Fill a signed positive int with random bits.
|
||||
// This is to overcome issues in MSVC which limits RAND_MAX to 32767.
|
||||
inline int random_int() {
#if RAND_MAX == INT_MAX
  // A single std::rand() draw is returned unchanged.
  return std::rand();
#else
  enum {
    rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
    int_bits  = meta_floor_log2<(unsigned int)(INT_MAX)+1>::value
  };
  // Chain std::rand() draws, shifting the accumulator left by rand_bits
  // before each new draw, until at least int_bits bits have been gathered.
  unsigned int bits = std::rand();
  int gathered = rand_bits;
  while (gathered < int(int_bits)) {
    bits = (bits << rand_bits) ^ std::rand();
    gathered += rand_bits;
  }
  // Masking with INT_MAX keeps the value a non-negative int.
  return static_cast<int>(bits & INT_MAX);
#endif
}
|
||||
|
||||
template<typename Scalar>
|
||||
struct random_default_impl<Scalar, false, false>
|
||||
{
|
||||
static inline Scalar run(const Scalar& x, const Scalar& y)
|
||||
{
|
||||
return x + (y-x) * Scalar(random_int()) / Scalar(EIGEN_RAND_MAX);
|
||||
}
|
||||
static inline Scalar run()
|
||||
{
|
||||
return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct random_default_impl<Scalar, false, true>
|
||||
{
|
||||
@@ -632,12 +723,12 @@ struct random_default_impl<Scalar, false, true>
|
||||
ScalarX offset = 0;
|
||||
ScalarX divisor = 1;
|
||||
ScalarX multiplier = 1;
|
||||
const unsigned rand_max = RAND_MAX;
|
||||
const unsigned rand_max = EIGEN_RAND_MAX;
|
||||
if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
|
||||
else multiplier = 1 + range / (rand_max + 1);
|
||||
// Rejection sampling.
|
||||
do {
|
||||
offset = (unsigned(std::rand()) * multiplier) / divisor;
|
||||
offset = (unsigned(random_int()) * multiplier) / divisor;
|
||||
} while (offset > range);
|
||||
return Scalar(ScalarX(x) + offset);
|
||||
}
|
||||
@@ -647,12 +738,12 @@ struct random_default_impl<Scalar, false, true>
|
||||
#ifdef EIGEN_MAKING_DOCS
|
||||
return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
|
||||
#else
|
||||
enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
|
||||
enum { rand_bits = meta_floor_log2<(unsigned int)(EIGEN_RAND_MAX)+1>::value,
|
||||
scalar_bits = sizeof(Scalar) * CHAR_BIT,
|
||||
shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
|
||||
offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
|
||||
};
|
||||
return Scalar((std::rand() >> shift) - offset);
|
||||
return Scalar((random_int() >> shift) - offset);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
@@ -662,8 +753,8 @@ struct random_default_impl<Scalar, true, false>
|
||||
{
|
||||
static inline Scalar run(const Scalar& x, const Scalar& y)
|
||||
{
|
||||
return Scalar(random(real(x), real(y)),
|
||||
random(imag(x), imag(y)));
|
||||
return Scalar(random(x.real(), y.real()),
|
||||
random(x.imag(), y.imag()));
|
||||
}
|
||||
static inline Scalar run()
|
||||
{
|
||||
@@ -916,6 +1007,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
|
||||
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
|
||||
}
|
||||
|
||||
// Non-template abs2 overload for bool: returns the argument unchanged.
EIGEN_DEVICE_FUNC
|
||||
inline bool abs2(bool x) { return x; }
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
|
||||
@@ -937,7 +1031,7 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
|
||||
return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log1p(const float &x) { return ::log1pf(x); }
|
||||
|
||||
@@ -971,7 +1065,7 @@ T (floor)(const T& x)
|
||||
return floor(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float floor(const float &x) { return ::floorf(x); }
|
||||
|
||||
@@ -987,7 +1081,7 @@ T (ceil)(const T& x)
|
||||
return ceil(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float ceil(const float &x) { return ::ceilf(x); }
|
||||
|
||||
@@ -1035,7 +1129,7 @@ T log(const T &x) {
|
||||
return log(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log(const float &x) { return ::logf(x); }
|
||||
|
||||
@@ -1063,7 +1157,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
|
||||
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float abs(const float &x) { return ::fabsf(x); }
|
||||
|
||||
@@ -1088,7 +1182,7 @@ T exp(const T &x) {
|
||||
return exp(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float exp(const float &x) { return ::expf(x); }
|
||||
|
||||
@@ -1103,7 +1197,7 @@ T cos(const T &x) {
|
||||
return cos(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cos(const float &x) { return ::cosf(x); }
|
||||
|
||||
@@ -1118,7 +1212,7 @@ T sin(const T &x) {
|
||||
return sin(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sin(const float &x) { return ::sinf(x); }
|
||||
|
||||
@@ -1133,7 +1227,7 @@ T tan(const T &x) {
|
||||
return tan(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tan(const float &x) { return ::tanf(x); }
|
||||
|
||||
@@ -1148,7 +1242,7 @@ T acos(const T &x) {
|
||||
return acos(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float acos(const float &x) { return ::acosf(x); }
|
||||
|
||||
@@ -1163,7 +1257,7 @@ T asin(const T &x) {
|
||||
return asin(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float asin(const float &x) { return ::asinf(x); }
|
||||
|
||||
@@ -1178,7 +1272,7 @@ T atan(const T &x) {
|
||||
return atan(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float atan(const float &x) { return ::atanf(x); }
|
||||
|
||||
@@ -1194,7 +1288,7 @@ T cosh(const T &x) {
|
||||
return cosh(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cosh(const float &x) { return ::coshf(x); }
|
||||
|
||||
@@ -1209,7 +1303,7 @@ T sinh(const T &x) {
|
||||
return sinh(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sinh(const float &x) { return ::sinhf(x); }
|
||||
|
||||
@@ -1224,12 +1318,12 @@ T tanh(const T &x) {
|
||||
return tanh(x);
|
||||
}
|
||||
|
||||
#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
|
||||
#if (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(const float &x) { return ::tanhf(x); }
|
||||
|
||||
@@ -1244,7 +1338,7 @@ T fmod(const T& a, const T& b) {
|
||||
return fmod(a, b);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float fmod(const float& a, const float& b) {
|
||||
|
||||
@@ -464,7 +464,8 @@ template<typename Derived> class MatrixBase
|
||||
EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)
|
||||
|
||||
private:
|
||||
EIGEN_DEVICE_FUNC explicit MatrixBase(int);
|
||||
|
||||
@@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
|
||||
/** \returns an expression of the temporary version of *this.
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline const NestByValue<Derived>
|
||||
EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
|
||||
DenseBase<Derived>::nestByValue() const
|
||||
{
|
||||
return NestByValue<Derived>(derived());
|
||||
|
||||
@@ -54,34 +54,34 @@ struct default_digits10_impl<T,false,true> // Integer
|
||||
*
|
||||
* The provided data consists of:
|
||||
* \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
|
||||
* then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
|
||||
* then \c Real is just a typedef to \a T. If \a T is `std::complex<U>` then \c Real
|
||||
* is a typedef to \a U.
|
||||
* \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
|
||||
* such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
|
||||
* \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
|
||||
* take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
|
||||
* only intended as a helper for code that needs to explicitly promote types.
|
||||
* \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
|
||||
* \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for `std::complex<U>`, Literal is defined as \c U.
|
||||
* Of course, this type must be fully compatible with \a T. If in doubt, just use \a T here.
|
||||
* \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
|
||||
* \li A typedef \c Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
|
||||
* this means, just use \a T here.
|
||||
* \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
|
||||
* \li An enum value \c IsComplex. It is equal to 1 if \a T is a `std::complex`
|
||||
* type, and to 0 otherwise.
|
||||
* \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
|
||||
* \li An enum value \c IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
|
||||
* and to \c 0 otherwise.
|
||||
* \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
|
||||
* \li Enum values \c ReadCost, \c AddCost and \c MulCost representing a rough estimate of the number of CPU cycles needed
|
||||
* by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
|
||||
* Stay vague here. No need to do architecture-specific stuff.
|
||||
* \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
|
||||
* \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
|
||||
* \li An enum value \c IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
|
||||
* \li An enum value \c RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
|
||||
* be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
|
||||
* \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
|
||||
* \li An `epsilon()` function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">`std::numeric_limits::epsilon()`</a>,
|
||||
* returns a \a Real instead of a \a T.
|
||||
* \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
|
||||
* \li A `dummy_precision()` function returning a weak epsilon value. It is mainly used as a default
|
||||
* value by the fuzzy comparison operators.
|
||||
* \li highest() and lowest() functions returning the highest and lowest possible values respectively.
|
||||
* \li digits10() function returning the number of decimal digits that can be represented without change. This is
|
||||
* the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
|
||||
* \li `highest()` and `lowest()` functions returning the highest and lowest possible values respectively.
|
||||
* \li `digits10()` function returning the number of decimal digits that can be represented without change. This is
|
||||
* the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">`std::numeric_limits<T>::digits10`</a>
|
||||
* which is used as the default implementation if specialized.
|
||||
*/
|
||||
|
||||
@@ -166,7 +166,16 @@ template<> struct NumTraits<double> : GenericNumTraits<double>
|
||||
template<> struct NumTraits<long double>
|
||||
: GenericNumTraits<long double>
|
||||
{
|
||||
static inline long double dummy_precision() { return 1e-15l; }
|
||||
static inline long double dummy_precision() { return static_cast<long double>(1e-15l); }
|
||||
|
||||
#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
|
||||
// PowerPC double double causes issues with some values
|
||||
static inline long double epsilon()
|
||||
{
|
||||
// 2^(-(__LDBL_MANT_DIG__)+1)
|
||||
return static_cast<long double>(2.4651903288156618919116517665087e-32l);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
template<typename _Real> struct NumTraits<std::complex<_Real> >
|
||||
|
||||
@@ -87,17 +87,6 @@ class PermutationBase : public EigenBase<Derived>
|
||||
return derived();
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
Derived& operator=(const PermutationBase& other)
|
||||
{
|
||||
indices() = other.indices();
|
||||
return derived();
|
||||
}
|
||||
#endif
|
||||
|
||||
/** \returns the number of rows */
|
||||
inline Index rows() const { return Index(indices().size()); }
|
||||
|
||||
@@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
|
||||
inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
|
||||
: m_indices(other.indices()) {}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** Standard copy constructor. Defined only to prevent a default copy constructor
|
||||
* from hiding the other templated constructor */
|
||||
inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
|
||||
#endif
|
||||
|
||||
/** Generic constructor from expression of the indices. The indices
|
||||
* array has the meaning that the permutations sends each integer i to indices[i].
|
||||
*
|
||||
@@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
|
||||
return Base::operator=(tr.derived());
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
PermutationMatrix& operator=(const PermutationMatrix& other)
|
||||
{
|
||||
m_indices = other.m_indices;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/** const version of indices(). */
|
||||
const IndicesType& indices() const { return m_indices; }
|
||||
/** \returns a reference to the stored array representing the permutation. */
|
||||
|
||||
@@ -737,8 +737,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
|
||||
bool(NumTraits<T1>::IsInteger),
|
||||
const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
|
||||
const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
|
||||
EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
|
||||
t1_is_integer_alike,
|
||||
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
|
||||
resize(rows,cols);
|
||||
}
|
||||
@@ -773,9 +775,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
&& ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
|
||||
{
|
||||
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
|
||||
const bool is_integer = NumTraits<T>::IsInteger;
|
||||
EIGEN_UNUSED_VARIABLE(is_integer);
|
||||
EIGEN_STATIC_ASSERT(is_integer,
|
||||
const bool is_integer_alike = internal::is_valid_index_type<T>::value;
|
||||
EIGEN_UNUSED_VARIABLE(is_integer_alike);
|
||||
EIGEN_STATIC_ASSERT(is_integer_alike,
|
||||
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
|
||||
resize(size);
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
#define EIGEN_PRODUCTEVALUATORS_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
|
||||
namespace internal {
|
||||
|
||||
/** \internal
|
||||
@@ -22,19 +22,19 @@ namespace internal {
|
||||
* Since products require special treatments to handle all possible cases,
|
||||
* we simply defer the evaluation logic to a product_evaluator class
|
||||
* which offers more partial specialization possibilities.
|
||||
*
|
||||
*
|
||||
* \sa class product_evaluator
|
||||
*/
|
||||
template<typename Lhs, typename Rhs, int Options>
|
||||
struct evaluator<Product<Lhs, Rhs, Options> >
|
||||
struct evaluator<Product<Lhs, Rhs, Options> >
|
||||
: public product_evaluator<Product<Lhs, Rhs, Options> >
|
||||
{
|
||||
typedef Product<Lhs, Rhs, Options> XprType;
|
||||
typedef product_evaluator<XprType> Base;
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
|
||||
};
|
||||
|
||||
|
||||
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
|
||||
// TODO we should apply that rule only if that's really helpful
|
||||
template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
|
||||
@@ -62,12 +62,12 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
|
||||
|
||||
|
||||
template<typename Lhs, typename Rhs, int DiagIndex>
|
||||
struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
|
||||
struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
|
||||
: public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
|
||||
{
|
||||
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
|
||||
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
|
||||
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
|
||||
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
|
||||
@@ -108,23 +108,23 @@ struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsSh
|
||||
: m_result(xpr.rows(), xpr.cols())
|
||||
{
|
||||
::new (static_cast<Base*>(this)) Base(m_result);
|
||||
|
||||
|
||||
// FIXME shall we handle nested_eval here?,
|
||||
// if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
|
||||
// typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
|
||||
// typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
|
||||
// typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
|
||||
// typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
|
||||
//
|
||||
//
|
||||
// const LhsNested lhs(xpr.lhs());
|
||||
// const RhsNested rhs(xpr.rhs());
|
||||
//
|
||||
//
|
||||
// generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
|
||||
|
||||
generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
protected:
|
||||
PlainObject m_result;
|
||||
};
|
||||
|
||||
@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
@@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
|
||||
@@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
|
||||
@@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
|
||||
typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
|
||||
const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
|
||||
const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
|
||||
{
|
||||
call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
|
||||
@@ -250,13 +250,13 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
|
||||
{
|
||||
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
|
||||
@@ -298,7 +298,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
|
||||
{
|
||||
template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
|
||||
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
|
||||
struct set { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
|
||||
struct add { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
|
||||
@@ -310,31 +310,31 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
|
||||
dst.const_cast_derived() += m_scale * src;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
@@ -343,7 +343,7 @@ template<typename Lhs, typename Rhs, typename Derived>
|
||||
struct generic_product_impl_base
|
||||
{
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
|
||||
@@ -355,7 +355,7 @@ struct generic_product_impl_base
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{ Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
|
||||
@@ -385,32 +385,58 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
|
||||
};
|
||||
|
||||
template<typename Lhs, typename Rhs>
|
||||
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
|
||||
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
|
||||
{
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
|
||||
// but easier on the compiler side
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// dst.noalias() += lhs.lazyProduct(rhs);
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// dst.noalias() -= lhs.lazyProduct(rhs);
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
|
||||
// Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
|
||||
// dst {,+,-}= s * (A.lazyProduct(B))
|
||||
// This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
|
||||
// For them, this strategy is also faster than simply by-passing the heap allocation through
|
||||
// stack allocation.
|
||||
// For fixed-size matrices, this is less obvious: it is sometimes 2x faster, but sometimes 3x slower,
|
||||
// and the behavior also depends a lot on the compiler... so let's be conservative and enable it for dynamic sizes only,
|
||||
// that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
|
||||
template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
|
||||
const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
|
||||
{
|
||||
call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
|
||||
}
|
||||
|
||||
// Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
|
||||
// overload more specialized.
|
||||
template<typename Dst, typename LhsT, typename Func>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
|
||||
{
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
|
||||
}
|
||||
|
||||
|
||||
// template<typename Dst>
|
||||
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
// { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
|
||||
@@ -471,7 +497,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
|
||||
typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
|
||||
typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
|
||||
|
||||
|
||||
typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
|
||||
typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
|
||||
|
||||
@@ -490,7 +516,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
|
||||
|
||||
enum {
|
||||
|
||||
|
||||
LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
|
||||
RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
|
||||
CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
|
||||
@@ -499,10 +525,10 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
+ (InnerSize - 1) * NumTraits<Scalar>::AddCost,
|
||||
|
||||
Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
|
||||
|
||||
|
||||
LhsFlags = LhsEtorType::Flags,
|
||||
RhsFlags = RhsEtorType::Flags,
|
||||
|
||||
|
||||
LhsRowMajor = LhsFlags & RowMajorBit,
|
||||
RhsRowMajor = RhsFlags & RowMajorBit,
|
||||
|
||||
@@ -512,7 +538,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
// Here, we don't care about alignment larger than the usable packet size.
|
||||
LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
|
||||
RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
|
||||
|
||||
|
||||
SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
|
||||
|
||||
CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
|
||||
@@ -527,7 +553,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
// TODO enable vectorization for mixed types
|
||||
| (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
|
||||
| (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
|
||||
|
||||
|
||||
LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
|
||||
RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
|
||||
|
||||
@@ -546,7 +572,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
&& (LhsFlags & RhsFlags & ActualPacketAccessBit)
|
||||
&& (InnerSize % packet_traits<Scalar>::size == 0)
|
||||
};
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
|
||||
@@ -585,7 +611,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
protected:
|
||||
typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
|
||||
typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
|
||||
|
||||
|
||||
LhsEtorType m_lhsImpl;
|
||||
RhsEtorType m_rhsImpl;
|
||||
|
||||
@@ -704,7 +730,7 @@ struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
|
||||
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
|
||||
{
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
|
||||
template<typename Dest>
|
||||
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
@@ -718,7 +744,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
|
||||
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
|
||||
{
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
|
||||
template<typename Dest>
|
||||
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
@@ -739,7 +765,7 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
|
||||
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
|
||||
{
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
|
||||
template<typename Dest>
|
||||
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
@@ -752,7 +778,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
|
||||
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
|
||||
{
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
|
||||
template<typename Dest>
|
||||
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
@@ -764,7 +790,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
|
||||
/***************************************************************************
|
||||
* Diagonal products
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
|
||||
struct diagonal_product_evaluator_base
|
||||
: evaluator_base<Derived>
|
||||
@@ -773,7 +799,7 @@ struct diagonal_product_evaluator_base
|
||||
public:
|
||||
enum {
|
||||
CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
|
||||
|
||||
|
||||
MatrixFlags = evaluator<MatrixType>::Flags,
|
||||
DiagFlags = evaluator<DiagonalType>::Flags,
|
||||
_StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
|
||||
@@ -791,14 +817,14 @@ public:
|
||||
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|
||||
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
|
||||
};
|
||||
|
||||
|
||||
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
|
||||
: m_diagImpl(diag), m_matImpl(mat)
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
|
||||
{
|
||||
if(AsScalarProduct)
|
||||
@@ -806,7 +832,7 @@ public:
|
||||
else
|
||||
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
|
||||
@@ -814,7 +840,7 @@ protected:
|
||||
return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
|
||||
internal::pset1<PacketType>(m_diagImpl.coeff(id)));
|
||||
}
|
||||
|
||||
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
|
||||
{
|
||||
@@ -825,7 +851,7 @@ protected:
|
||||
return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
|
||||
m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
|
||||
}
|
||||
|
||||
|
||||
evaluator<DiagonalType> m_diagImpl;
|
||||
evaluator<MatrixType> m_matImpl;
|
||||
};
|
||||
@@ -840,10 +866,10 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
||||
using Base::m_matImpl;
|
||||
using Base::coeff;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
|
||||
|
||||
typedef Product<Lhs, Rhs, ProductKind> XprType;
|
||||
typedef typename XprType::PlainObject PlainObject;
|
||||
|
||||
|
||||
enum {
|
||||
StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
|
||||
};
|
||||
@@ -852,12 +878,12 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
||||
: Base(xpr.rhs(), xpr.lhs().diagonal())
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
|
||||
{
|
||||
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
||||
}
|
||||
|
||||
|
||||
#ifndef __CUDACC__
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
@@ -867,7 +893,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
||||
return this->template packet_impl<LoadMode,PacketType>(row,col, row,
|
||||
typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
|
||||
}
|
||||
|
||||
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index idx) const
|
||||
{
|
||||
@@ -886,22 +912,22 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
||||
using Base::m_matImpl;
|
||||
using Base::coeff;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
|
||||
|
||||
typedef Product<Lhs, Rhs, ProductKind> XprType;
|
||||
typedef typename XprType::PlainObject PlainObject;
|
||||
|
||||
|
||||
enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
|
||||
: Base(xpr.lhs(), xpr.rhs().diagonal())
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
|
||||
{
|
||||
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
||||
}
|
||||
|
||||
|
||||
#ifndef __CUDACC__
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
@@ -909,7 +935,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
||||
return this->template packet_impl<LoadMode,PacketType>(row,col, col,
|
||||
typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
|
||||
}
|
||||
|
||||
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index idx) const
|
||||
{
|
||||
@@ -991,7 +1017,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
|
||||
struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
|
||||
{
|
||||
template<typename Dest>
|
||||
static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
|
||||
}
|
||||
@@ -1001,7 +1027,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
|
||||
struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
|
||||
{
|
||||
template<typename Dest>
|
||||
static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
|
||||
}
|
||||
@@ -1011,7 +1037,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
|
||||
struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
|
||||
{
|
||||
template<typename Dest>
|
||||
static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
|
||||
{
|
||||
permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
|
||||
}
|
||||
@@ -1021,7 +1047,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
|
||||
struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
|
||||
{
|
||||
template<typename Dest>
|
||||
static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
|
||||
static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
|
||||
{
|
||||
permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
|
||||
}
|
||||
@@ -1043,7 +1069,7 @@ struct transposition_matrix_product
|
||||
{
|
||||
typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
|
||||
typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
|
||||
|
||||
|
||||
template<typename Dest, typename TranspositionType>
|
||||
static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
|
||||
{
|
||||
@@ -1068,7 +1094,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
|
||||
struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
|
||||
{
|
||||
template<typename Dest>
|
||||
static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
|
||||
}
|
||||
@@ -1078,7 +1104,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
|
||||
struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
|
||||
{
|
||||
template<typename Dest>
|
||||
static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
|
||||
}
|
||||
@@ -1089,7 +1115,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
|
||||
struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
|
||||
{
|
||||
template<typename Dest>
|
||||
static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
|
||||
{
|
||||
transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
|
||||
}
|
||||
@@ -1099,7 +1125,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
|
||||
struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
|
||||
{
|
||||
template<typename Dest>
|
||||
static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
|
||||
static EIGEN_DEVICE_FUNC void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
|
||||
{
|
||||
transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_RANDOM_H
|
||||
#define EIGEN_RANDOM_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -29,16 +29,16 @@ struct functor_traits<scalar_random_op<Scalar> >
|
||||
*
|
||||
* Numbers are uniformly spread through their whole definition range for integer types,
|
||||
* and in the [-1:1] range for floating point scalar types.
|
||||
*
|
||||
*
|
||||
* The parameters \a rows and \a cols are the number of rows and of columns of
|
||||
* the returned matrix. Must be compatible with this MatrixBase type.
|
||||
*
|
||||
* \not_reentrant
|
||||
*
|
||||
*
|
||||
* This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
|
||||
* it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
|
||||
* instead.
|
||||
*
|
||||
*
|
||||
*
|
||||
* Example: \include MatrixBase_random_int_int.cpp
|
||||
* Output: \verbinclude MatrixBase_random_int_int.out
|
||||
@@ -46,7 +46,7 @@ struct functor_traits<scalar_random_op<Scalar> >
|
||||
* This expression has the "evaluate before nesting" flag so that it will be evaluated into
|
||||
* a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
|
||||
* behavior with expressions involving random matrices.
|
||||
*
|
||||
*
|
||||
* See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
|
||||
*
|
||||
* \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
|
||||
@@ -93,7 +93,7 @@ DenseBase<Derived>::Random(Index size)
|
||||
*
|
||||
* Numbers are uniformly spread through their whole definition range for integer types,
|
||||
* and in the [-1:1] range for floating point scalar types.
|
||||
*
|
||||
*
|
||||
* This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
|
||||
* need to use the variants taking size arguments.
|
||||
*
|
||||
@@ -103,7 +103,7 @@ DenseBase<Derived>::Random(Index size)
|
||||
* This expression has the "evaluate before nesting" flag so that it will be evaluated into
|
||||
* a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
|
||||
* behavior with expressions involving random matrices.
|
||||
*
|
||||
*
|
||||
* \not_reentrant
|
||||
*
|
||||
* \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
|
||||
@@ -119,16 +119,16 @@ DenseBase<Derived>::Random()
|
||||
*
|
||||
* Numbers are uniformly spread through their whole definition range for integer types,
|
||||
* and in the [-1:1] range for floating point scalar types.
|
||||
*
|
||||
*
|
||||
* \not_reentrant
|
||||
*
|
||||
*
|
||||
* Example: \include MatrixBase_setRandom.cpp
|
||||
* Output: \verbinclude MatrixBase_setRandom.out
|
||||
*
|
||||
* \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline Derived& DenseBase<Derived>::setRandom()
|
||||
EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
|
||||
{
|
||||
return *this = Random(rows(), cols());
|
||||
}
|
||||
@@ -137,7 +137,7 @@ inline Derived& DenseBase<Derived>::setRandom()
|
||||
*
|
||||
* Numbers are uniformly spread through their whole definition range for integer types,
|
||||
* and in the [-1:1] range for floating point scalar types.
|
||||
*
|
||||
*
|
||||
* \only_for_vectors
|
||||
* \not_reentrant
|
||||
*
|
||||
@@ -160,7 +160,7 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
|
||||
* and in the [-1:1] range for floating point scalar types.
|
||||
*
|
||||
* \not_reentrant
|
||||
*
|
||||
*
|
||||
* \param rows the new number of rows
|
||||
* \param cols the new number of columns
|
||||
*
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_REDUX_H
|
||||
#define EIGEN_REDUX_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -60,7 +60,7 @@ public:
|
||||
enum {
|
||||
Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
|
||||
};
|
||||
|
||||
|
||||
#ifdef EIGEN_DEBUG_ASSIGN
|
||||
static void debug()
|
||||
{
|
||||
@@ -128,7 +128,7 @@ template<typename Func, typename Derived, int Start>
|
||||
struct redux_novec_unroller<Func, Derived, Start, 0>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
|
||||
};
|
||||
|
||||
@@ -215,7 +215,7 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
|
||||
static Scalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
const Index size = mat.size();
|
||||
|
||||
|
||||
const Index packetSize = redux_traits<Func, Derived>::PacketSize;
|
||||
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
|
||||
enum {
|
||||
@@ -336,12 +336,12 @@ class redux_evaluator
|
||||
public:
|
||||
typedef _XprType XprType;
|
||||
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
|
||||
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
typedef typename XprType::PacketReturnType PacketReturnType;
|
||||
|
||||
|
||||
enum {
|
||||
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
|
||||
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
|
||||
@@ -353,7 +353,7 @@ public:
|
||||
CoeffReadCost = evaluator<XprType>::CoeffReadCost,
|
||||
Alignment = evaluator<XprType>::Alignment
|
||||
};
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
|
||||
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
|
||||
EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
|
||||
@@ -375,17 +375,17 @@ public:
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketType packet(Index index) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(index); }
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
|
||||
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketType packetByOuterInner(Index outer, Index inner) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
|
||||
|
||||
const XprType & nestedExpression() const { return m_xpr; }
|
||||
|
||||
|
||||
protected:
|
||||
internal::evaluator<XprType> m_evaluator;
|
||||
const XprType &m_xpr;
|
||||
@@ -407,14 +407,14 @@ protected:
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename Func>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::redux(const Func& func) const
|
||||
{
|
||||
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
|
||||
|
||||
typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
|
||||
ThisEvaluator thisEval(derived());
|
||||
|
||||
|
||||
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
|
||||
}
|
||||
|
||||
@@ -422,7 +422,7 @@ DenseBase<Derived>::redux(const Func& func) const
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::minCoeff() const
|
||||
{
|
||||
return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
|
||||
@@ -432,7 +432,7 @@ DenseBase<Derived>::minCoeff() const
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::maxCoeff() const
|
||||
{
|
||||
return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
|
||||
@@ -445,7 +445,7 @@ DenseBase<Derived>::maxCoeff() const
|
||||
* \sa trace(), prod(), mean()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::sum() const
|
||||
{
|
||||
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
|
||||
@@ -458,7 +458,7 @@ DenseBase<Derived>::sum() const
|
||||
* \sa trace(), prod(), sum()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::mean() const
|
||||
{
|
||||
#ifdef __INTEL_COMPILER
|
||||
@@ -479,7 +479,7 @@ DenseBase<Derived>::mean() const
|
||||
* \sa sum(), mean(), trace()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::prod() const
|
||||
{
|
||||
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
|
||||
@@ -494,7 +494,7 @@ DenseBase<Derived>::prod() const
|
||||
* \sa diagonal(), sum()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
MatrixBase<Derived>::trace() const
|
||||
{
|
||||
return derived().diagonal().sum();
|
||||
|
||||
@@ -28,12 +28,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
|
||||
|
||||
template<typename Derived> struct match {
|
||||
enum {
|
||||
IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
|
||||
HasDirectAccess = internal::has_direct_access<Derived>::ret,
|
||||
StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
|
||||
StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
|
||||
InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
|
||||
|| int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
|
||||
|| (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
|
||||
OuterStrideMatch = Derived::IsVectorAtCompileTime
|
||||
OuterStrideMatch = IsVectorAtCompileTime
|
||||
|| int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
|
||||
// NOTE, this indirection of evaluator<Derived>::Alignment is needed
|
||||
// to workaround a very strange bug in MSVC related to the instantiation
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_REPLICATE_H
|
||||
#define EIGEN_REPLICATE_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
template<typename MatrixType,int RowFactor,int ColFactor>
|
||||
@@ -35,7 +35,7 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
|
||||
IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
|
||||
: MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
|
||||
: (MatrixType::Flags & RowMajorBit) ? 1 : 0,
|
||||
|
||||
|
||||
// FIXME enable DirectAccess with negative strides?
|
||||
Flags = IsRowMajor ? RowMajorBit : 0
|
||||
};
|
||||
@@ -95,8 +95,8 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
const _MatrixTypeNested& nestedExpression() const
|
||||
{
|
||||
return m_matrix;
|
||||
{
|
||||
return m_matrix;
|
||||
}
|
||||
|
||||
protected:
|
||||
@@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<int RowFactor, int ColFactor>
|
||||
const Replicate<Derived,RowFactor,ColFactor>
|
||||
EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
|
||||
DenseBase<Derived>::replicate() const
|
||||
{
|
||||
return Replicate<Derived,RowFactor,ColFactor>(derived());
|
||||
@@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
|
||||
* \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
|
||||
*/
|
||||
template<typename ExpressionType, int Direction>
|
||||
const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
|
||||
{
|
||||
return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
|
||||
@@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
|
||||
EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
|
||||
{
|
||||
other.evalTo(derived());
|
||||
return derived();
|
||||
@@ -90,7 +90,7 @@ namespace internal {
|
||||
// Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
|
||||
// when a ReturnByValue expression is assigned, the evaluator is not constructed.
|
||||
// TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
|
||||
|
||||
|
||||
template<typename Derived>
|
||||
struct evaluator<ReturnByValue<Derived> >
|
||||
: public evaluator<typename internal::traits<Derived>::ReturnType>
|
||||
@@ -98,7 +98,7 @@ struct evaluator<ReturnByValue<Derived> >
|
||||
typedef ReturnByValue<Derived> XprType;
|
||||
typedef typename internal::traits<Derived>::ReturnType PlainObject;
|
||||
typedef evaluator<PlainObject> Base;
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
|
||||
: m_result(xpr.rows(), xpr.cols())
|
||||
{
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
#ifndef EIGEN_REVERSE_H
|
||||
#define EIGEN_REVERSE_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -44,7 +44,7 @@ template<typename PacketType> struct reverse_packet_cond<PacketType,false>
|
||||
static inline PacketType run(const PacketType& x) { return x; }
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace internal
|
||||
|
||||
/** \class Reverse
|
||||
* \ingroup Core_Module
|
||||
@@ -98,7 +98,7 @@ template<typename MatrixType, int Direction> class Reverse
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
|
||||
nestedExpression() const
|
||||
nestedExpression() const
|
||||
{
|
||||
return m_matrix;
|
||||
}
|
||||
@@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
|
||||
*
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::ReverseReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
|
||||
DenseBase<Derived>::reverse()
|
||||
{
|
||||
return ReverseReturnType(derived());
|
||||
@@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
|
||||
*
|
||||
* \sa VectorwiseOp::reverseInPlace(), reverse() */
|
||||
template<typename Derived>
|
||||
inline void DenseBase<Derived>::reverseInPlace()
|
||||
EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
|
||||
{
|
||||
if(cols()>rows())
|
||||
{
|
||||
@@ -161,7 +161,7 @@ inline void DenseBase<Derived>::reverseInPlace()
|
||||
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
template<int Direction>
|
||||
struct vectorwise_reverse_inplace_impl;
|
||||
|
||||
@@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
|
||||
*
|
||||
* \sa DenseBase::reverseInPlace(), reverse() */
|
||||
template<typename ExpressionType, int Direction>
|
||||
void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
|
||||
EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
|
||||
{
|
||||
internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_SELFADJOINTMATRIX_H
|
||||
#define EIGEN_SELFADJOINTMATRIX_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
/** \class SelfAdjointView
|
||||
* \ingroup Core_Module
|
||||
@@ -58,7 +58,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
||||
typedef MatrixTypeNestedCleaned NestedExpression;
|
||||
|
||||
/** \brief The type of coefficients in this matrix */
|
||||
typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
|
||||
typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
|
||||
typedef typename MatrixType::StorageIndex StorageIndex;
|
||||
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
|
||||
|
||||
@@ -131,7 +131,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
||||
{
|
||||
return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
|
||||
}
|
||||
|
||||
|
||||
friend EIGEN_DEVICE_FUNC
|
||||
const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
|
||||
operator*(const Scalar& s, const SelfAdjointView& mat)
|
||||
@@ -287,17 +287,17 @@ protected:
|
||||
using Base::m_src;
|
||||
using Base::m_functor;
|
||||
public:
|
||||
|
||||
|
||||
typedef typename Base::DstEvaluatorType DstEvaluatorType;
|
||||
typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
typedef typename Base::AssignmentTraits AssignmentTraits;
|
||||
|
||||
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
|
||||
: Base(dst, src, func, dstExpr)
|
||||
{}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
|
||||
{
|
||||
eigen_internal_assert(row!=col);
|
||||
@@ -305,12 +305,12 @@ public:
|
||||
m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
|
||||
m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
|
||||
{
|
||||
Base::assignCoeff(id,id);
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
|
||||
{ eigen_internal_assert(false && "should never be called"); }
|
||||
};
|
||||
@@ -324,7 +324,7 @@ public:
|
||||
/** This is the const version of MatrixBase::selfadjointView() */
|
||||
template<typename Derived>
|
||||
template<unsigned int UpLo>
|
||||
typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
|
||||
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
|
||||
MatrixBase<Derived>::selfadjointView() const
|
||||
{
|
||||
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
|
||||
@@ -341,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<unsigned int UpLo>
|
||||
typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
|
||||
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
|
||||
MatrixBase<Derived>::selfadjointView()
|
||||
{
|
||||
return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
namespace Eigen {
|
||||
|
||||
template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
|
||||
|
||||
|
||||
/** \class Solve
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
@@ -64,11 +64,11 @@ class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<R
|
||||
public:
|
||||
typedef typename internal::traits<Solve>::PlainObject PlainObject;
|
||||
typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
|
||||
|
||||
|
||||
Solve(const Decomposition &dec, const RhsType &rhs)
|
||||
: m_dec(dec), m_rhs(rhs)
|
||||
{}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
|
||||
EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
|
||||
|
||||
@@ -87,14 +87,14 @@ class SolveImpl<Decomposition,RhsType,Dense>
|
||||
: public MatrixBase<Solve<Decomposition,RhsType> >
|
||||
{
|
||||
typedef Solve<Decomposition,RhsType> Derived;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
|
||||
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
|
||||
|
||||
private:
|
||||
|
||||
|
||||
Scalar coeff(Index row, Index col) const;
|
||||
Scalar coeff(Index i) const;
|
||||
};
|
||||
@@ -119,15 +119,15 @@ struct evaluator<Solve<Decomposition,RhsType> >
|
||||
typedef evaluator<PlainObject> Base;
|
||||
|
||||
enum { Flags = Base::Flags | EvalBeforeNestingBit };
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
|
||||
: m_result(solve.rows(), solve.cols())
|
||||
{
|
||||
::new (static_cast<Base*>(this)) Base(m_result);
|
||||
solve.dec()._solve_impl(solve.rhs(), m_result);
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
protected:
|
||||
PlainObject m_result;
|
||||
};
|
||||
|
||||
@@ -137,7 +137,7 @@ template<typename DstXprType, typename DecType, typename RhsType, typename Scala
|
||||
struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
|
||||
{
|
||||
typedef Solve<DecType,RhsType> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
Index dstCols = src.cols();
|
||||
@@ -153,7 +153,7 @@ template<typename DstXprType, typename DecType, typename RhsType, typename Scala
|
||||
struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
|
||||
{
|
||||
typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
Index dstCols = src.cols();
|
||||
@@ -170,13 +170,13 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
|
||||
internal::assign_op<Scalar,Scalar>, Dense2Dense>
|
||||
{
|
||||
typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
Index dstCols = src.cols();
|
||||
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
|
||||
dst.resize(dstRows, dstCols);
|
||||
|
||||
|
||||
src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_SOLVETRIANGULAR_H
|
||||
#define EIGEN_SOLVETRIANGULAR_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -19,7 +19,7 @@ namespace internal {
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
|
||||
struct triangular_solve_vector;
|
||||
|
||||
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
|
||||
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
|
||||
struct triangular_solve_matrix;
|
||||
|
||||
// small helper struct extracting some traits on the underlying solver operation
|
||||
@@ -64,7 +64,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
|
||||
(useRhsDirectly ? rhs.data() : 0));
|
||||
|
||||
|
||||
if(!useRhsDirectly)
|
||||
MappedRhs(actualRhs,rhs.size()) = rhs;
|
||||
|
||||
@@ -98,8 +98,8 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
|
||||
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
|
||||
|
||||
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
|
||||
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
|
||||
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
|
||||
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
|
||||
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -148,7 +148,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
|
||||
{
|
||||
Transpose<const Lhs> trLhs(lhs);
|
||||
Transpose<Rhs> trRhs(rhs);
|
||||
|
||||
|
||||
triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
|
||||
((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
|
||||
0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
|
||||
@@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<int Side, typename OtherDerived>
|
||||
void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
|
||||
{
|
||||
OtherDerived& other = _other.const_cast_derived();
|
||||
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
|
||||
@@ -187,7 +187,7 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
|
||||
|
||||
template<typename Derived, unsigned int Mode>
|
||||
template<int Side, typename Other>
|
||||
const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
|
||||
EIGEN_DEVICE_FUNC const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
|
||||
TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
|
||||
{
|
||||
return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_TRANSPOSE_H
|
||||
#define EIGEN_TRANSPOSE_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
template<typename MatrixType>
|
||||
@@ -146,6 +146,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
|
||||
{
|
||||
return derived().nestedExpression().coeffRef(index);
|
||||
}
|
||||
protected:
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
|
||||
};
|
||||
|
||||
/** \returns an expression of the transpose of *this.
|
||||
@@ -168,7 +170,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
|
||||
*
|
||||
* \sa transposeInPlace(), adjoint() */
|
||||
template<typename Derived>
|
||||
inline Transpose<Derived>
|
||||
EIGEN_DEVICE_FUNC inline Transpose<Derived>
|
||||
DenseBase<Derived>::transpose()
|
||||
{
|
||||
return TransposeReturnType(derived());
|
||||
@@ -180,7 +182,7 @@ DenseBase<Derived>::transpose()
|
||||
*
|
||||
* \sa transposeInPlace(), adjoint() */
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::ConstTransposeReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
|
||||
DenseBase<Derived>::transpose() const
|
||||
{
|
||||
return ConstTransposeReturnType(derived());
|
||||
@@ -206,7 +208,7 @@ DenseBase<Derived>::transpose() const
|
||||
*
|
||||
* \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
|
||||
template<typename Derived>
|
||||
inline const typename MatrixBase<Derived>::AdjointReturnType
|
||||
EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
|
||||
MatrixBase<Derived>::adjoint() const
|
||||
{
|
||||
return AdjointReturnType(this->transpose());
|
||||
@@ -276,12 +278,12 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
|
||||
* Notice however that this method is only useful if you want to replace a matrix by its own transpose.
|
||||
* If you just need the transpose of a matrix, use transpose().
|
||||
*
|
||||
* \note if the matrix is not square, then \c *this must be a resizable matrix.
|
||||
* \note if the matrix is not square, then \c *this must be a resizable matrix.
|
||||
* This excludes (non-square) fixed-size matrices, block-expressions and maps.
|
||||
*
|
||||
* \sa transpose(), adjoint(), adjointInPlace() */
|
||||
template<typename Derived>
|
||||
inline void DenseBase<Derived>::transposeInPlace()
|
||||
EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
|
||||
{
|
||||
eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
|
||||
&& "transposeInPlace() called on a non-square non-resizable matrix");
|
||||
@@ -312,7 +314,7 @@ inline void DenseBase<Derived>::transposeInPlace()
|
||||
*
|
||||
* \sa transpose(), adjoint(), transposeInPlace() */
|
||||
template<typename Derived>
|
||||
inline void MatrixBase<Derived>::adjointInPlace()
|
||||
EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
|
||||
{
|
||||
derived() = adjoint().eval();
|
||||
}
|
||||
|
||||
@@ -33,17 +33,6 @@ class TranspositionsBase
|
||||
indices() = other.indices();
|
||||
return derived();
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
Derived& operator=(const TranspositionsBase& other)
|
||||
{
|
||||
indices() = other.indices();
|
||||
return derived();
|
||||
}
|
||||
#endif
|
||||
|
||||
/** \returns the number of transpositions */
|
||||
Index size() const { return indices().size(); }
|
||||
@@ -171,12 +160,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
|
||||
inline Transpositions(const TranspositionsBase<OtherDerived>& other)
|
||||
: m_indices(other.indices()) {}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** Standard copy constructor. Defined only to prevent a default copy constructor
|
||||
* from hiding the other templated constructor */
|
||||
inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
|
||||
#endif
|
||||
|
||||
/** Generic constructor from expression of the transposition indices. */
|
||||
template<typename Other>
|
||||
explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
|
||||
@@ -189,17 +172,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
|
||||
return Base::operator=(other);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
Transpositions& operator=(const Transpositions& other)
|
||||
{
|
||||
m_indices = other.m_indices;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Constructs an uninitialized permutation matrix of given size.
|
||||
*/
|
||||
inline Transpositions(Index size) : m_indices(size)
|
||||
@@ -306,17 +278,6 @@ class TranspositionsWrapper
|
||||
return Base::operator=(other);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** This is a special case of the templated operator=. Its purpose is to
|
||||
* prevent a default operator= from hiding the templated operator=.
|
||||
*/
|
||||
TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
|
||||
{
|
||||
m_indices = other.m_indices;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/** const version of indices(). */
|
||||
const IndicesType& indices() const { return m_indices; }
|
||||
|
||||
|
||||
@@ -11,12 +11,12 @@
|
||||
#ifndef EIGEN_TRIANGULARMATRIX_H
|
||||
#define EIGEN_TRIANGULARMATRIX_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/** \class TriangularBase
|
||||
@@ -34,16 +34,16 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
|
||||
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
|
||||
MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
|
||||
MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
|
||||
|
||||
|
||||
SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
|
||||
internal::traits<Derived>::ColsAtCompileTime>::ret),
|
||||
/**< This is equal to the number of coefficients, i.e. the number of
|
||||
* rows times the number of columns, or to \a Dynamic if this is not
|
||||
* known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
|
||||
|
||||
|
||||
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
|
||||
internal::traits<Derived>::MaxColsAtCompileTime>::ret)
|
||||
|
||||
|
||||
};
|
||||
typedef typename internal::traits<Derived>::Scalar Scalar;
|
||||
typedef typename internal::traits<Derived>::StorageKind StorageKind;
|
||||
@@ -63,7 +63,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
|
||||
inline Index outerStride() const { return derived().outerStride(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index innerStride() const { return derived().innerStride(); }
|
||||
|
||||
|
||||
// dummy resize function
|
||||
void resize(Index rows, Index cols)
|
||||
{
|
||||
@@ -155,7 +155,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
|
||||
* \param MatrixType the type of the object in which we are taking the triangular part
|
||||
* \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
|
||||
* #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
|
||||
* This is in fact a bit field; it must have either #Upper or #Lower,
|
||||
* This is in fact a bit field; it must have either #Upper or #Lower,
|
||||
* and additionally it may have #UnitDiag or #ZeroDiag or neither.
|
||||
*
|
||||
* This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
|
||||
@@ -197,7 +197,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
|
||||
|
||||
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
|
||||
@@ -216,10 +216,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
|
||||
{}
|
||||
|
||||
using Base::operator=;
|
||||
TriangularView& operator=(const TriangularView &other)
|
||||
{ return Base::operator=(other); }
|
||||
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
|
||||
|
||||
/** \copydoc EigenBase::rows() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -235,7 +233,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
/** \returns a reference to the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
NestedExpression& nestedExpression() { return m_matrix; }
|
||||
|
||||
|
||||
typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
|
||||
/** \sa MatrixBase::conjugate() const */
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -257,7 +255,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
typename MatrixType::TransposeReturnType tmp(m_matrix);
|
||||
return TransposeReturnType(tmp);
|
||||
}
|
||||
|
||||
|
||||
typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
|
||||
/** \sa MatrixBase::transpose() const */
|
||||
EIGEN_DEVICE_FUNC
|
||||
@@ -268,10 +266,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
|
||||
template<typename Other>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Solve<TriangularView, Other>
|
||||
inline const Solve<TriangularView, Other>
|
||||
solve(const MatrixBase<Other>& other) const
|
||||
{ return Solve<TriangularView, Other>(*this, other.derived()); }
|
||||
|
||||
|
||||
// workaround MSVC ICE
|
||||
#if EIGEN_COMP_MSVC
|
||||
template<int Side, typename Other>
|
||||
@@ -315,7 +313,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
else
|
||||
return m_matrix.diagonal().prod();
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
|
||||
MatrixTypeNested m_matrix;
|
||||
@@ -377,7 +375,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
||||
|
||||
/** \sa MatrixBase::operator*=() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
TriangularViewType& operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
|
||||
@@ -544,6 +542,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
template<typename ProductType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
|
||||
protected:
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
|
||||
|
||||
};
|
||||
|
||||
/***************************************************************************
|
||||
@@ -554,7 +556,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
// FIXME should we keep that possibility
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
inline TriangularView<MatrixType, Mode>&
|
||||
EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
|
||||
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@@ -564,7 +566,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
|
||||
// FIXME should we keep that possibility
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
|
||||
}
|
||||
@@ -573,7 +575,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot
|
||||
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
inline TriangularView<MatrixType, Mode>&
|
||||
EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
|
||||
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
|
||||
{
|
||||
eigen_assert(Mode == int(OtherDerived::Mode));
|
||||
@@ -583,7 +585,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
|
||||
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
|
||||
{
|
||||
eigen_assert(Mode == int(OtherDerived::Mode));
|
||||
internal::call_assignment_no_alias(derived(), other.derived());
|
||||
@@ -598,7 +600,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
|
||||
* If the matrix is triangular, the opposite part is set to zero. */
|
||||
template<typename Derived>
|
||||
template<typename DenseDerived>
|
||||
void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
|
||||
EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
|
||||
{
|
||||
evalToLazy(other.derived());
|
||||
}
|
||||
@@ -624,7 +626,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<unsigned int Mode>
|
||||
typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
|
||||
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
|
||||
MatrixBase<Derived>::triangularView()
|
||||
{
|
||||
return typename TriangularViewReturnType<Mode>::Type(derived());
|
||||
@@ -633,7 +635,7 @@ MatrixBase<Derived>::triangularView()
|
||||
/** This is the const version of MatrixBase::triangularView() */
|
||||
template<typename Derived>
|
||||
template<unsigned int Mode>
|
||||
typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
|
||||
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
|
||||
MatrixBase<Derived>::triangularView() const
|
||||
{
|
||||
return typename ConstTriangularViewReturnType<Mode>::Type(derived());
|
||||
@@ -698,7 +700,7 @@ bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
|
||||
// TODO currently a triangular expression has the form TriangularView<.,.>
|
||||
// in the future triangular-ness should be defined by the expression traits
|
||||
// such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
|
||||
@@ -726,7 +728,7 @@ struct Dense2Triangular {};
|
||||
|
||||
template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;
|
||||
|
||||
|
||||
|
||||
/** \internal Specialization of the dense assignment kernel for triangular matrices.
|
||||
* The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
|
||||
* \tparam UpLo must be either Lower or Upper
|
||||
@@ -743,17 +745,17 @@ protected:
|
||||
using Base::m_src;
|
||||
using Base::m_functor;
|
||||
public:
|
||||
|
||||
|
||||
typedef typename Base::DstEvaluatorType DstEvaluatorType;
|
||||
typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
typedef typename Base::AssignmentTraits AssignmentTraits;
|
||||
|
||||
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
|
||||
: Base(dst, src, func, dstExpr)
|
||||
{}
|
||||
|
||||
|
||||
#ifdef EIGEN_INTERNAL_DEBUGGING
|
||||
EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
|
||||
{
|
||||
@@ -763,16 +765,16 @@ public:
|
||||
#else
|
||||
using Base::assignCoeff;
|
||||
#endif
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
|
||||
{
|
||||
if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
|
||||
else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
|
||||
else if(Mode==0) Base::assignCoeff(id,id);
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
|
||||
{
|
||||
{
|
||||
eigen_internal_assert(row!=col);
|
||||
if(SetOpposite)
|
||||
m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
|
||||
@@ -793,17 +795,17 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con
|
||||
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
|
||||
dst.resize(dstRows, dstCols);
|
||||
DstEvaluatorType dstEvaluator(dst);
|
||||
|
||||
|
||||
typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
|
||||
DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
|
||||
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
|
||||
|
||||
|
||||
enum {
|
||||
unroll = DstXprType::SizeAtCompileTime != Dynamic
|
||||
&& SrcEvaluatorType::CoeffReadCost < HugeCost
|
||||
&& DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
|
||||
};
|
||||
|
||||
|
||||
triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
|
||||
}
|
||||
|
||||
@@ -825,8 +827,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
|
||||
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
|
||||
{
|
||||
eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
|
||||
|
||||
call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
|
||||
|
||||
call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -835,7 +837,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
|
||||
{
|
||||
call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);
|
||||
call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -844,7 +846,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
|
||||
{
|
||||
call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
|
||||
call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -855,19 +857,19 @@ struct triangular_assignment_loop
|
||||
// FIXME: this is not very clean, perhaps this information should be provided by the kernel?
|
||||
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
|
||||
typedef typename DstEvaluatorType::XprType DstXprType;
|
||||
|
||||
|
||||
enum {
|
||||
col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
|
||||
row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
|
||||
};
|
||||
|
||||
|
||||
typedef typename Kernel::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline void run(Kernel &kernel)
|
||||
{
|
||||
triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
|
||||
|
||||
|
||||
if(row==col)
|
||||
kernel.assignDiagonalCoeff(row);
|
||||
else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
|
||||
@@ -910,10 +912,10 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
|
||||
}
|
||||
else
|
||||
i = maxi;
|
||||
|
||||
|
||||
if(i<kernel.rows()) // then i==j
|
||||
kernel.assignDiagonalCoeff(i++);
|
||||
|
||||
|
||||
if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
|
||||
{
|
||||
for(; i < kernel.rows(); ++i)
|
||||
@@ -930,20 +932,20 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
|
||||
* If the matrix is triangular, the opposite part is set to zero. */
|
||||
template<typename Derived>
|
||||
template<typename DenseDerived>
|
||||
void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
|
||||
EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
|
||||
{
|
||||
other.derived().resize(this->rows(), this->cols());
|
||||
internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
|
||||
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
// Triangular = Product
|
||||
template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
|
||||
struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
Index dstCols = src.cols();
|
||||
@@ -959,7 +961,7 @@ template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
|
||||
struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
|
||||
{
|
||||
dst._assignProduct(src, 1, 1);
|
||||
}
|
||||
@@ -970,7 +972,7 @@ template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
|
||||
struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
|
||||
{
|
||||
dst._assignProduct(src, -1, 1);
|
||||
}
|
||||
|
||||
@@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
* \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::ColwiseReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
|
||||
DenseBase<Derived>::colwise()
|
||||
{
|
||||
return ColwiseReturnType(derived());
|
||||
@@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
|
||||
* \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::RowwiseReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
|
||||
DenseBase<Derived>::rowwise()
|
||||
{
|
||||
return RowwiseReturnType(derived());
|
||||
|
||||
@@ -29,6 +29,7 @@ namespace internal {
|
||||
#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
|
||||
const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
|
||||
|
||||
|
||||
// Natural logarithm
|
||||
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
|
||||
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
|
||||
@@ -47,6 +48,7 @@ plog<Packet16f>(const Packet16f& _x) {
|
||||
// The smallest non denormalized float number.
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
|
||||
|
||||
// Polynomial coefficients.
|
||||
@@ -64,11 +66,9 @@ plog<Packet16f>(const Packet16f& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
|
||||
|
||||
// invalid_mask is set to true when x is NaN
|
||||
__mmask16 invalid_mask =
|
||||
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
|
||||
__mmask16 iszero_mask =
|
||||
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
|
||||
|
||||
__mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
|
||||
__mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
|
||||
|
||||
// Truncate input values to the minimum positive normal.
|
||||
x = pmax(x, p16f_min_norm_pos);
|
||||
|
||||
@@ -118,11 +118,18 @@ plog<Packet16f>(const Packet16f& _x) {
|
||||
x = padd(x, y);
|
||||
x = padd(x, y2);
|
||||
|
||||
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
|
||||
__mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
|
||||
// Filter out invalid inputs, i.e.:
|
||||
// - negative arg will be NAN,
|
||||
// - 0 will be -INF.
|
||||
// - +INF will be +INF
|
||||
return _mm512_mask_blend_ps(iszero_mask,
|
||||
_mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
|
||||
p16f_minus_inf);
|
||||
_mm512_mask_blend_ps(invalid_mask,
|
||||
_mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
|
||||
p16f_nan),
|
||||
p16f_minus_inf);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Exponential function. Works by writing "x = m*log(2) + r" where
|
||||
@@ -258,48 +265,39 @@ pexp<Packet8d>(const Packet8d& _x) {
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
|
||||
psqrt<Packet16f>(const Packet16f& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
|
||||
Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
|
||||
__mmask16 denormal_mask = _mm512_kand(
|
||||
_mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
|
||||
_CMP_LT_OQ),
|
||||
_mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
|
||||
|
||||
Packet16f neg_half = pmul(_x, p16f_minus_half);
|
||||
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
|
||||
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
|
||||
Packet16f x = _mm512_rsqrt14_ps(_x);
|
||||
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
|
||||
|
||||
// Multiply the original _x by it's reciprocal square root to extract the
|
||||
// square root.
|
||||
return pmul(_x, x);
|
||||
// Flush results for denormals to zero.
|
||||
return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
|
||||
psqrt<Packet8d>(const Packet8d& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
|
||||
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
|
||||
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
|
||||
Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
|
||||
__mmask16 denormal_mask = _mm512_kand(
|
||||
_mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
|
||||
_CMP_LT_OQ),
|
||||
_mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
|
||||
|
||||
Packet8d neg_half = pmul(_x, p8d_minus_half);
|
||||
Packet8d x = _mm512_rsqrt14_pd(_x);
|
||||
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
|
||||
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
|
||||
|
||||
// Do a first step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
|
||||
|
||||
// Do a second step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
|
||||
|
||||
// Multiply the original _x by it's reciprocal square root to extract the
|
||||
// square root.
|
||||
return pmul(_x, x);
|
||||
return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
|
||||
@@ -19,10 +19,10 @@ namespace internal {
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
|
||||
#endif
|
||||
|
||||
#ifdef __FMA__
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#endif
|
||||
@@ -54,13 +54,14 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
AlignedOnScalar = 1,
|
||||
size = 16,
|
||||
HasHalfPacket = 1,
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 3)
|
||||
HasBlend = 0,
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
HasLog = 1,
|
||||
#endif
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasSqrt = EIGEN_FAST_MATH,
|
||||
HasRsqrt = EIGEN_FAST_MATH,
|
||||
#endif
|
||||
HasDiv = 1
|
||||
};
|
||||
@@ -74,8 +75,8 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
AlignedOnScalar = 1,
|
||||
size = 8,
|
||||
HasHalfPacket = 1,
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 3)
|
||||
HasSqrt = 1,
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
|
||||
HasSqrt = EIGEN_FAST_MATH,
|
||||
HasRsqrt = EIGEN_FAST_MATH,
|
||||
#endif
|
||||
HasDiv = 1
|
||||
@@ -98,6 +99,7 @@ template <>
|
||||
struct unpacket_traits<Packet16f> {
|
||||
typedef float type;
|
||||
typedef Packet8f half;
|
||||
typedef Packet16i integer_packet;
|
||||
enum { size = 16, alignment=Aligned64 };
|
||||
};
|
||||
template <>
|
||||
@@ -132,7 +134,7 @@ EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
|
||||
return _mm512_broadcastsd_pd(_mm_load_pd1(from));
|
||||
return _mm512_set1_pd(*from);
|
||||
}
|
||||
|
||||
template <>
|
||||
@@ -158,6 +160,11 @@ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
|
||||
const Packet8d& b) {
|
||||
return _mm512_add_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
|
||||
const Packet16i& b) {
|
||||
return _mm512_add_epi32(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
|
||||
@@ -169,6 +176,11 @@ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
|
||||
const Packet8d& b) {
|
||||
return _mm512_sub_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
|
||||
const Packet16i& b) {
|
||||
return _mm512_sub_epi32(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
|
||||
@@ -202,6 +214,11 @@ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
|
||||
const Packet8d& b) {
|
||||
return _mm512_mul_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
|
||||
const Packet16i& b) {
|
||||
return _mm512_mul_epi32(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
|
||||
@@ -214,7 +231,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
|
||||
return _mm512_div_pd(a, b);
|
||||
}
|
||||
|
||||
#ifdef __FMA__
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
|
||||
const Packet16f& c) {
|
||||
@@ -230,23 +247,73 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
|
||||
const Packet16f& b) {
|
||||
return _mm512_min_ps(a, b);
|
||||
// Arguments are reversed to match NaN propagation behavior of std::min.
|
||||
return _mm512_min_ps(b, a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
|
||||
const Packet8d& b) {
|
||||
return _mm512_min_pd(a, b);
|
||||
// Arguments are reversed to match NaN propagation behavior of std::min.
|
||||
return _mm512_min_pd(b, a);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
|
||||
const Packet16f& b) {
|
||||
return _mm512_max_ps(a, b);
|
||||
// Arguments are reversed to match NaN propagation behavior of std::max.
|
||||
return _mm512_max_ps(b, a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
|
||||
const Packet8d& b) {
|
||||
return _mm512_max_pd(a, b);
|
||||
// Arguments are reversed to match NaN propagation behavior of std::max.
|
||||
return _mm512_max_pd(b, a);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
|
||||
template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
|
||||
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
|
||||
#else
|
||||
// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
|
||||
template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
|
||||
return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
|
||||
}
|
||||
|
||||
// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
|
||||
template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
|
||||
return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
|
||||
return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
|
||||
_mm256_castps_si256(b),1));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Helper function for bit packing snippet of low precision comparison.
|
||||
// It packs the flags from 32x16 to 16x16.
|
||||
EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
|
||||
// Split data into small pieces and handle with AVX instructions
|
||||
// to guarantee internal order of vector.
|
||||
// Operation:
|
||||
// dst[15:0] := Saturate16(rf[31:0])
|
||||
// dst[31:16] := Saturate16(rf[63:32])
|
||||
// ...
|
||||
// dst[255:240] := Saturate16(rf[255:224])
|
||||
__m256i lo = _mm256_castps_si256(extract256<0>(rf));
|
||||
__m256i hi = _mm256_castps_si256(extract256<1>(rf));
|
||||
__m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
|
||||
_mm256_extractf128_si256(lo, 1));
|
||||
__m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
|
||||
_mm256_extractf128_si256(hi, 1));
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
|
||||
const Packet16i& b) {
|
||||
return _mm512_and_si512(a,b);
|
||||
}
|
||||
|
||||
template <>
|
||||
@@ -255,24 +322,7 @@ EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
return _mm512_and_ps(a, b);
|
||||
#else
|
||||
Packet16f res = _mm512_undefined_ps();
|
||||
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
|
||||
res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
|
||||
|
||||
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
|
||||
res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
|
||||
|
||||
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
|
||||
res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
|
||||
|
||||
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
|
||||
res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
|
||||
|
||||
return res;
|
||||
return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
@@ -288,35 +338,21 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
|
||||
|
||||
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
|
||||
res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
|
||||
|
||||
return res;
|
||||
return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
|
||||
const Packet16f& b) {
|
||||
EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
||||
return _mm512_or_si512(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
return _mm512_or_ps(a, b);
|
||||
#else
|
||||
Packet16f res = _mm512_undefined_ps();
|
||||
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
|
||||
res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
|
||||
|
||||
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
|
||||
res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
|
||||
|
||||
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
|
||||
res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
|
||||
|
||||
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
|
||||
res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
|
||||
|
||||
return res;
|
||||
return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -326,109 +362,67 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
return _mm512_or_pd(a, b);
|
||||
#else
|
||||
Packet8d res = _mm512_undefined_pd();
|
||||
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
|
||||
res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
|
||||
|
||||
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
|
||||
res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
|
||||
|
||||
return res;
|
||||
return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
|
||||
const Packet16f& b) {
|
||||
EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
||||
return _mm512_xor_si512(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
return _mm512_xor_ps(a, b);
|
||||
#else
|
||||
Packet16f res = _mm512_undefined_ps();
|
||||
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
|
||||
res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
|
||||
|
||||
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
|
||||
res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
|
||||
|
||||
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
|
||||
res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
|
||||
|
||||
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
|
||||
res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
|
||||
|
||||
return res;
|
||||
return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
|
||||
const Packet8d& b) {
|
||||
EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
return _mm512_xor_pd(a, b);
|
||||
#else
|
||||
Packet8d res = _mm512_undefined_pd();
|
||||
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
|
||||
res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
|
||||
|
||||
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
|
||||
res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
|
||||
|
||||
return res;
|
||||
return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
|
||||
const Packet16f& b) {
|
||||
EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
||||
return _mm512_andnot_si512(b, a);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
return _mm512_andnot_ps(a, b);
|
||||
return _mm512_andnot_ps(b, a);
|
||||
#else
|
||||
Packet16f res = _mm512_undefined_ps();
|
||||
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
|
||||
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
|
||||
|
||||
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
|
||||
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
|
||||
|
||||
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
|
||||
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
|
||||
|
||||
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
|
||||
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
|
||||
|
||||
return res;
|
||||
return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
|
||||
const Packet8d& b) {
|
||||
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
return _mm512_andnot_pd(a, b);
|
||||
return _mm512_andnot_pd(b, a);
|
||||
#else
|
||||
Packet8d res = _mm512_undefined_pd();
|
||||
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
|
||||
res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
|
||||
|
||||
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
|
||||
res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
|
||||
|
||||
return res;
|
||||
return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Shifts every 32-bit integer element of `a` right by the compile-time
// count N using _mm512_srai_epi32 (arithmetic shift: sign bits shifted in).
template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
|
||||
return _mm512_srai_epi32(a, N);
|
||||
}
|
||||
|
||||
// Shifts every 32-bit integer element of `a` right by the compile-time
// count N using _mm512_srli_epi32 (logical shift: zeros shifted in).
template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
|
||||
return _mm512_srli_epi32(a, N);
|
||||
}
|
||||
|
||||
// Shifts every 32-bit integer element of `a` left by the compile-time
// count N using _mm512_slli_epi32 (zeros shifted in).
template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
|
||||
return _mm512_slli_epi32(a, N);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
|
||||
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
|
||||
@@ -461,75 +455,55 @@ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
|
||||
// {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
|
||||
Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
|
||||
// mimic an "inplace" permutation of the lower 128bits using a blend
|
||||
lane0 = _mm256_blend_ps(
|
||||
lane0, _mm256_castps128_ps256(_mm_permute_ps(
|
||||
_mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
|
||||
15);
|
||||
// then we can perform a consistent permutation on the global register to get
|
||||
// everything in shape:
|
||||
lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
|
||||
Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
|
||||
// mimic an "inplace" permutation of the lower 128bits using a blend
|
||||
lane1 = _mm256_blend_ps(
|
||||
lane1, _mm256_castps128_ps256(_mm_permute_ps(
|
||||
_mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
|
||||
15);
|
||||
// then we can perform a consistent permutation on the global register to get
|
||||
// everything in shape:
|
||||
lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
|
||||
// an unaligned load is required here as there is no requirement
|
||||
// on the alignment of input pointer 'from'
|
||||
__m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
|
||||
__m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
|
||||
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
return pairs;
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
Packet16f res = _mm512_undefined_ps();
|
||||
return _mm512_insertf32x8(res, lane0, 0);
|
||||
return _mm512_insertf32x8(res, lane1, 1);
|
||||
return res;
|
||||
#else
|
||||
Packet16f res = _mm512_undefined_ps();
|
||||
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
|
||||
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
|
||||
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
|
||||
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
|
||||
return res;
|
||||
#endif
|
||||
}
|
||||
// FIXME: this does not look optimal, better load a Packet4d and shuffle...
|
||||
// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3,
|
||||
// a3}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
||||
Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
|
||||
lane0 = _mm256_permute_pd(lane0, 3 << 2);
|
||||
|
||||
Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
|
||||
lane1 = _mm256_permute_pd(lane1, 3 << 2);
|
||||
|
||||
Packet8d res = _mm512_undefined_pd();
|
||||
res = _mm512_insertf64x4(res, lane0, 0);
|
||||
return _mm512_insertf64x4(res, lane1, 1);
|
||||
__m512d x = _mm512_setzero_pd();
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
|
||||
return x;
|
||||
}
|
||||
#else
|
||||
// Loads 4 doubles from `from` and returns {a0, a0, a1, a1, a2, a2, a3, a3}.
// This is the variant selected by the surrounding #else branch
// (NOTE(review): the controlling #if is outside this view, presumably
// the non-AVX512DQ path; confirm).
// Each masked broadcast writes from[k] into the two adjacent 64-bit elements
// selected by the mask 0x3 << (2*k); all other elements keep their value in x.
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
||||
// Start from an all-zero register; every element is overwritten below.
__m512d x = _mm512_setzero_pd();
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
|
||||
return x;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Loads 4 floats from memory and returns the packet
|
||||
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
|
||||
Packet16f tmp = _mm512_undefined_ps();
|
||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
|
||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
|
||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
|
||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
|
||||
return tmp;
|
||||
Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
|
||||
const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
|
||||
return _mm512_permutexvar_ps(scatter_mask, tmp);
|
||||
}
|
||||
|
||||
// Loads 2 doubles from memory and returns the packet
|
||||
// {a0, a0, a0, a0, a1, a1, a1, a1}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
|
||||
Packet8d tmp = _mm512_undefined_pd();
|
||||
Packet2d tmp0 = _mm_load_pd1(from);
|
||||
Packet2d tmp1 = _mm_load_pd1(from + 1);
|
||||
Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
|
||||
Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
|
||||
__m256d lane0 = _mm256_set1_pd(*from);
|
||||
__m256d lane1 = _mm256_set1_pd(*(from+1));
|
||||
__m512d tmp = _mm512_undefined_pd();
|
||||
tmp = _mm512_insertf64x4(tmp, lane0, 0);
|
||||
return _mm512_insertf64x4(tmp, lane1, 1);
|
||||
}
|
||||
@@ -565,7 +539,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
|
||||
Index stride) {
|
||||
Packet16i stride_vector = _mm512_set1_epi32(stride);
|
||||
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
||||
Packet16i stride_multiplier =
|
||||
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
||||
@@ -575,7 +549,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
|
||||
Index stride) {
|
||||
Packet8i stride_vector = _mm256_set1_epi32(stride);
|
||||
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
||||
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
||||
|
||||
@@ -586,7 +560,7 @@ template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
|
||||
const Packet16f& from,
|
||||
Index stride) {
|
||||
Packet16i stride_vector = _mm512_set1_epi32(stride);
|
||||
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
||||
Packet16i stride_multiplier =
|
||||
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
||||
@@ -596,7 +570,7 @@ template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
|
||||
const Packet8d& from,
|
||||
Index stride) {
|
||||
Packet8i stride_vector = _mm256_set1_epi32(stride);
|
||||
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
||||
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
||||
_mm512_i32scatter_pd(to, indices, from, 8);
|
||||
@@ -660,8 +634,8 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
|
||||
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
|
||||
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \
|
||||
_mm512_extractf32x8_ps(INPUT, 1)
|
||||
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
|
||||
__m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
|
||||
#else
|
||||
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
|
||||
__m256 OUTPUT##_0 = _mm256_insertf128_ps( \
|
||||
@@ -674,17 +648,136 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
|
||||
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \
|
||||
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
|
||||
OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
|
||||
#else
|
||||
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
|
||||
OUTPUT = _mm512_undefined_ps(); \
|
||||
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
|
||||
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
|
||||
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
|
||||
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
|
||||
#endif
|
||||
template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f*
|
||||
vecs)
|
||||
|
||||
// Horizontal sum: returns the sum of the 16 floats held in `a`.
// With EIGEN_VECTORIZE_AVX512DQ the two 256-bit halves are added and the
// remaining reduction is delegated to predux<Packet8f>; otherwise the four
// 128-bit lanes are added and folded with two horizontal adds.
template <>
|
||||
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
Packet8f x = _mm256_add_ps(lane0, lane1);
|
||||
return predux<Packet8f>(x);
|
||||
#else
|
||||
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
// sum = {s0, s1, s2, s3}: element-wise sum of the four lanes.
__m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
|
||||
// sum = {s0+s1, s2+s3, s0+s1, s2+s3}
sum = _mm_hadd_ps(sum, sum);
|
||||
// Element 0 of the result is sum[0] + sum[1], i.e. the full total; only
// that element is read back below.
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
|
||||
return _mm_cvtss_f32(sum);
|
||||
#endif
|
||||
}
|
||||
// Horizontal sum: returns the sum of the 8 doubles held in `a`.
template <>
|
||||
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
|
||||
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
// sum = {s0, s1, s2, s3}: element-wise sum of the two 256-bit halves.
__m256d sum = _mm256_add_pd(lane0, lane1);
|
||||
// The permute swaps the 128-bit halves of sum, so the horizontal add yields
// tmp0 = {s0+s1, s2+s3, s2+s3, s0+s1}.
__m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
|
||||
// Element 0 of hadd(tmp0, tmp0) is tmp0[0] + tmp0[1], the full total.
return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
|
||||
}
|
||||
|
||||
// Partial reduction: adds the upper 8 floats of `a` to the lower 8 floats and
// returns the resulting Packet8f (element i = a[i] + a[i + 8]).
// Both preprocessor branches compute the same values; the fallback works on
// 128-bit lanes and reassembles the 256-bit result.
template <>
|
||||
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
return padd(lane0, lane1);
|
||||
#else
|
||||
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
// Lanes 0/2 and 1/3 sit 8 floats apart, matching the half-to-half addition.
Packet4f sum0 = padd(lane0, lane2);
|
||||
Packet4f sum1 = padd(lane1, lane3);
|
||||
// sum0 becomes the low 128 bits, sum1 the high 128 bits of the result.
return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
|
||||
#endif
|
||||
}
|
||||
// Partial reduction: adds the upper 4 doubles of `a` to the lower 4 doubles
// and returns the resulting Packet4d (element i = a[i] + a[i + 4]).
template <>
|
||||
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
|
||||
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d res = padd(lane0, lane1);
|
||||
return res;
|
||||
}
|
||||
|
||||
// Horizontal product: returns the product of the 16 floats held in `a`.
// Only the #else branch is compiled; the 256-bit variant is disabled by
// `#if 0`.
// NOTE(review): the disabled branch passes a Packet8f to _mm_permute_ps
// (a 128-bit intrinsic), which is presumably why it is compiled out; confirm.
template <>
|
||||
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
|
||||
//#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
#if 0
|
||||
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
Packet8f res = pmul(lane0, lane1);
|
||||
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
|
||||
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
#else
|
||||
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
// res = {r0, r1, r2, r3}: element-wise product of the four 128-bit lanes.
__m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
|
||||
// The permute yields {r2, r3, r0, r0}; afterwards res[0] = r0*r2 and
// res[1] = r1*r3.
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
// The permute moves res[1] into element 0, so element 0 of the final
// product covers all four elements; pfirst reads that element.
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
#endif
|
||||
}
|
||||
// Horizontal product: returns the product of the 8 doubles held in `a`.
template <>
|
||||
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
|
||||
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
// res = {r0, r1, r2, r3}: element-wise product of the two 256-bit halves.
__m256d res = pmul(lane0, lane1);
|
||||
// Multiply by the half-swapped copy: res[0] = r0*r2, res[1] = r1*r3.
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
|
||||
// The shuffle places res[1] in element 0, so element 0 of the final product
// covers all four elements; pfirst reads that element.
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
|
||||
}
|
||||
|
||||
// Horizontal minimum: reduces the 16 floats held in `a` with _mm_min_ps and
// returns the value that ends up in element 0.
template <>
|
||||
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
|
||||
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
// res = {m0, m1, m2, m3}: element-wise minimum of the four 128-bit lanes.
__m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
|
||||
// The permute yields {m2, m3, m0, m0}; afterwards res[0] = min(m0, m2) and
// res[1] = min(m1, m3).
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
// The permute moves res[1] into element 0 for the final comparison.
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
}
|
||||
// Horizontal minimum: reduces the 8 doubles held in `a` with _mm256_min_pd
// and returns the value that ends up in element 0.
template <>
|
||||
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
|
||||
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
// Element-wise minimum of the two 256-bit halves.
__m256d res = _mm256_min_pd(lane0, lane1);
|
||||
// Combine with the half-swapped copy so elements 0 and 1 cover all four.
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
|
||||
// The shuffle places res[1] in element 0 for the final comparison.
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
|
||||
}
|
||||
|
||||
// Horizontal maximum: reduces the 16 floats held in `a` with _mm_max_ps and
// returns the value that ends up in element 0.
template <>
|
||||
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
|
||||
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
// res = {m0, m1, m2, m3}: element-wise maximum of the four 128-bit lanes.
__m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
|
||||
// The permute yields {m2, m3, m0, m0}; afterwards res[0] = max(m0, m2) and
// res[1] = max(m1, m3).
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
// The permute moves res[1] into element 0 for the final comparison.
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
}
|
||||
|
||||
// Horizontal maximum: reduces the 8 doubles held in `a` with _mm256_max_pd
// and returns the value that ends up in element 0.
template <>
|
||||
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
|
||||
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
// Element-wise maximum of the two 256-bit halves.
__m256d res = _mm256_max_pd(lane0, lane1);
|
||||
// Combine with the half-swapped copy so elements 0 and 1 cover all four.
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
|
||||
// The shuffle places res[1] in element 0 for the final comparison.
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs)
|
||||
{
|
||||
EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
|
||||
EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
|
||||
@@ -873,174 +966,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
|
||||
|
||||
return _mm512_insertf64x4(final_output, final_1, 1);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
|
||||
//#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
#if 0
|
||||
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
Packet8f sum = padd(lane0, lane1);
|
||||
Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
|
||||
tmp0 = _mm256_hadd_ps(tmp0, tmp0);
|
||||
return pfirst(_mm256_hadd_ps(tmp0, tmp0));
|
||||
#else
|
||||
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
|
||||
sum = _mm_hadd_ps(sum, sum);
|
||||
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
|
||||
return pfirst(sum);
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
|
||||
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d sum = padd(lane0, lane1);
|
||||
Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
|
||||
return pfirst(_mm256_hadd_pd(tmp0, tmp0));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
return padd(lane0, lane1);
|
||||
#else
|
||||
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f sum0 = padd(lane0, lane2);
|
||||
Packet4f sum1 = padd(lane1, lane3);
|
||||
return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
|
||||
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d res = padd(lane0, lane1);
|
||||
return res;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
|
||||
//#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
#if 0
|
||||
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
Packet8f res = pmul(lane0, lane1);
|
||||
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
|
||||
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
#else
|
||||
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
|
||||
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
|
||||
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d res = pmul(lane0, lane1);
|
||||
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
|
||||
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
|
||||
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
|
||||
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
|
||||
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d res = _mm256_min_pd(lane0, lane1);
|
||||
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
|
||||
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
|
||||
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
|
||||
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
|
||||
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
|
||||
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
|
||||
Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
|
||||
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
||||
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
|
||||
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
Packet4d res = _mm256_max_pd(lane0, lane1);
|
||||
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
|
||||
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
|
||||
}
|
||||
|
||||
template <int Offset>
|
||||
struct palign_impl<Offset, Packet16f> {
|
||||
static EIGEN_STRONG_INLINE void run(Packet16f& first,
|
||||
const Packet16f& second) {
|
||||
if (Offset != 0) {
|
||||
__m512i first_idx = _mm512_set_epi32(
|
||||
Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
|
||||
Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
|
||||
Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
|
||||
|
||||
__m512i second_idx =
|
||||
_mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
|
||||
Offset - 5, Offset - 6, Offset - 7, Offset - 8,
|
||||
Offset - 9, Offset - 10, Offset - 11, Offset - 12,
|
||||
Offset - 13, Offset - 14, Offset - 15, Offset - 16);
|
||||
|
||||
unsigned short mask = 0xFFFF;
|
||||
mask <<= (16 - Offset);
|
||||
|
||||
first = _mm512_permutexvar_ps(first_idx, first);
|
||||
Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
|
||||
first = _mm512_mask_blend_ps(mask, first, tmp);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <int Offset>
|
||||
struct palign_impl<Offset, Packet8d> {
|
||||
static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
|
||||
if (Offset != 0) {
|
||||
__m512i first_idx = _mm512_set_epi32(
|
||||
0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
|
||||
Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
|
||||
|
||||
__m512i second_idx = _mm512_set_epi32(
|
||||
0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
|
||||
Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
|
||||
|
||||
unsigned char mask = 0xFF;
|
||||
mask <<= (8 - Offset);
|
||||
|
||||
first = _mm512_permutexvar_pd(first_idx, first);
|
||||
Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
|
||||
first = _mm512_mask_blend_pd(mask, first, tmp);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
|
||||
@@ -1302,13 +1228,76 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
|
||||
return Packet16f();
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
|
||||
const Packet8d& /*thenPacket*/,
|
||||
const Packet8d& /*elsePacket*/) {
|
||||
assert(false && "To be implemented");
|
||||
return Packet8d();
|
||||
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
|
||||
const Packet8d& thenPacket,
|
||||
const Packet8d& elsePacket) {
|
||||
__mmask8 m = (ifPacket.select[0] )
|
||||
| (ifPacket.select[1]<<1)
|
||||
| (ifPacket.select[2]<<2)
|
||||
| (ifPacket.select[3]<<3)
|
||||
| (ifPacket.select[4]<<4)
|
||||
| (ifPacket.select[5]<<5)
|
||||
| (ifPacket.select[6]<<6)
|
||||
| (ifPacket.select[7]<<7);
|
||||
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
|
||||
}
|
||||
|
||||
// Converts 16 floats to 16 32-bit integers with _mm512_cvttps_epi32
// (the truncating conversion, i.e. rounding toward zero).
template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
|
||||
return _mm512_cvttps_epi32(a);
|
||||
}
|
||||
|
||||
// Converts 16 32-bit integers to 16 floats with _mm512_cvtepi32_ps.
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
|
||||
return _mm512_cvtepi32_ps(a);
|
||||
}
|
||||
|
||||
// Element-wise alignment of two float packets by a compile-time Offset.
// For Offset != 0, `first` is overwritten so that its low (16 - Offset)
// elements are first[Offset .. 15] and its high Offset elements are
// second[0 .. Offset - 1]. For Offset == 0, `first` is left untouched.
// NOTE(review): assumes 0 <= Offset <= 16 so that (16 - Offset) is a valid
// shift count; confirm against callers.
template <int Offset>
|
||||
struct palign_impl<Offset, Packet16f> {
|
||||
static EIGEN_STRONG_INLINE void run(Packet16f& first,
|
||||
const Packet16f& second) {
|
||||
if (Offset != 0) {
|
||||
// _mm512_set_epi32 lists elements from highest to lowest, so element i of
// first_idx is Offset + i.
__m512i first_idx = _mm512_set_epi32(
|
||||
Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
|
||||
Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
|
||||
Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
|
||||
|
||||
// Element i of second_idx is Offset - 16 + i; the permute only consumes the
// low index bits, so negative values wrap around within the packet.
__m512i second_idx =
|
||||
_mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
|
||||
Offset - 5, Offset - 6, Offset - 7, Offset - 8,
|
||||
Offset - 9, Offset - 10, Offset - 11, Offset - 12,
|
||||
Offset - 13, Offset - 14, Offset - 15, Offset - 16);
|
||||
|
||||
// After the shift (truncated to 16 bits) only the top Offset bits remain
// set: those elements are taken from `second` by the blend below.
unsigned short mask = 0xFFFF;
|
||||
mask <<= (16 - Offset);
|
||||
|
||||
first = _mm512_permutexvar_ps(first_idx, first);
|
||||
Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
|
||||
// Mask bit set -> element from tmp (second); clear -> element from first.
first = _mm512_mask_blend_ps(mask, first, tmp);
|
||||
}
|
||||
}
|
||||
};
|
||||
// Element-wise alignment of two double packets by a compile-time Offset.
// For Offset != 0, `first` is overwritten so that its low (8 - Offset)
// elements are first[Offset .. 7] and its high Offset elements are
// second[0 .. Offset - 1]. For Offset == 0, `first` is left untouched.
// NOTE(review): assumes 0 <= Offset <= 8 so that (8 - Offset) is a valid
// shift count; confirm against callers.
template <int Offset>
|
||||
struct palign_impl<Offset, Packet8d> {
|
||||
static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
|
||||
if (Offset != 0) {
|
||||
// Each 64-bit index is built from two 32-bit values (high word 0, low word
// the index); 64-bit element i of first_idx is Offset + i.
__m512i first_idx = _mm512_set_epi32(
|
||||
0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
|
||||
Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
|
||||
|
||||
// Low word of 64-bit element i is Offset - 8 + i; the permute only consumes
// the low index bits, so negative values wrap around within the packet.
__m512i second_idx = _mm512_set_epi32(
|
||||
0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
|
||||
Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
|
||||
|
||||
// After the shift (truncated to 8 bits) only the top Offset bits remain
// set: those elements are taken from `second` by the blend below.
unsigned char mask = 0xFF;
|
||||
mask <<= (8 - Offset);
|
||||
|
||||
first = _mm512_permutexvar_pd(first_idx, first);
|
||||
Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
|
||||
// Mask bit set -> element from tmp (second); clear -> element from first.
first = _mm512_mask_blend_pd(mask, first, tmp);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
@@ -15,7 +15,9 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
|
||||
inline Packet4ui p4ui_CONJ_XOR() {
|
||||
return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
|
||||
}
|
||||
#ifdef __VSX__
|
||||
#if defined(_BIG_ENDIAN)
|
||||
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
|
||||
@@ -29,8 +31,54 @@ static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (P
|
||||
//---------- float ----------
|
||||
struct Packet2cf
|
||||
{
|
||||
EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
|
||||
EIGEN_STRONG_INLINE explicit Packet2cf() {}
|
||||
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
|
||||
{
|
||||
Packet4f v1, v2;
|
||||
|
||||
// Permute and multiply the real parts of a and b
|
||||
v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
|
||||
// Get the imaginary parts of a
|
||||
v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
|
||||
// multiply a_re * b
|
||||
v1 = vec_madd(v1, b.v, p4f_ZERO);
|
||||
// multiply a_im * b and get the conjugate result
|
||||
v2 = vec_madd(v2, b.v, p4f_ZERO);
|
||||
v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
|
||||
// permute back to a proper order
|
||||
v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
|
||||
|
||||
return Packet2cf(padd<Packet4f>(v1, v2));
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
|
||||
v = pmul(Packet2cf(*this), b).v;
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
|
||||
return Packet2cf(*this) *= b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
|
||||
v = padd(v, b.v);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
|
||||
return Packet2cf(*this) += b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
|
||||
v = psub(v, b.v);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
|
||||
return Packet2cf(*this) -= b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
|
||||
return Packet2cf(-v);
|
||||
}
|
||||
|
||||
Packet4f v;
|
||||
};
|
||||
|
||||
@@ -82,14 +130,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
|
||||
{
|
||||
std::complex<float> EIGEN_ALIGN16 af[2];
|
||||
EIGEN_ALIGN16 std::complex<float> af[2];
|
||||
af[0] = from[0*stride];
|
||||
af[1] = from[1*stride];
|
||||
return pload<Packet2cf>(af);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
|
||||
{
|
||||
std::complex<float> EIGEN_ALIGN16 af[2];
|
||||
EIGEN_ALIGN16 std::complex<float> af[2];
|
||||
pstore<std::complex<float> >((std::complex<float> *) af, from);
|
||||
to[0*stride] = af[0];
|
||||
to[1*stride] = af[1];
|
||||
@@ -98,26 +146,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
||||
{
|
||||
Packet4f v1, v2;
|
||||
|
||||
// Permute and multiply the real parts of a and b
|
||||
v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
|
||||
// Get the imaginary parts of a
|
||||
v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
|
||||
// multiply a_re * b
|
||||
v1 = vec_madd(v1, b.v, p4f_ZERO);
|
||||
// multiply a_im * b and get the conjugate result
|
||||
v2 = vec_madd(v2, b.v, p4f_ZERO);
|
||||
v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
|
||||
// permute back to a proper order
|
||||
v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
|
||||
|
||||
return Packet2cf(padd<Packet4f>(v1, v2));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()))); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
|
||||
@@ -128,7 +157,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
|
||||
{
|
||||
std::complex<float> EIGEN_ALIGN16 res[2];
|
||||
EIGEN_ALIGN16 std::complex<float> res[2];
|
||||
pstore((float *)&res, a.v);
|
||||
|
||||
return res[0];
|
||||
@@ -152,7 +181,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
|
||||
{
|
||||
Packet4f b1, b2;
|
||||
#ifdef _BIG_ENDIAN
|
||||
#ifdef _BIG_ENDIAN
|
||||
b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
|
||||
b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
|
||||
#else
|
||||
@@ -260,6 +289,51 @@ struct Packet1cd
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd() {}
|
||||
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
|
||||
{
|
||||
Packet2d a_re, a_im, v1, v2;
|
||||
|
||||
// Permute and multiply the real parts of a and b
|
||||
a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
|
||||
// Get the imaginary parts of a
|
||||
a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
|
||||
// multiply a_re * b
|
||||
v1 = vec_madd(a_re, b.v, p2d_ZERO);
|
||||
// multiply a_im * b and get the conjugate result
|
||||
v2 = vec_madd(a_im, b.v, p2d_ZERO);
|
||||
v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
|
||||
v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
|
||||
|
||||
return Packet1cd(padd<Packet2d>(v1, v2));
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
|
||||
v = pmul(Packet1cd(*this), b).v;
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
|
||||
return Packet1cd(*this) *= b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
|
||||
v = padd(v, b.v);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
|
||||
return Packet1cd(*this) += b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
|
||||
v = psub(v, b.v);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
|
||||
return Packet1cd(*this) -= b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
|
||||
return Packet1cd(-v);
|
||||
}
|
||||
|
||||
Packet2d v;
|
||||
};
|
||||
|
||||
@@ -296,19 +370,13 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
|
||||
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
|
||||
{
|
||||
std::complex<double> EIGEN_ALIGN16 af[2];
|
||||
af[0] = from[0*stride];
|
||||
af[1] = from[1*stride];
|
||||
return pload<Packet1cd>(af);
|
||||
return pload<Packet1cd>(from);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
|
||||
{
|
||||
std::complex<double> EIGEN_ALIGN16 af[2];
|
||||
pstore<std::complex<double> >(af, from);
|
||||
to[0*stride] = af[0];
|
||||
to[1*stride] = af[1];
|
||||
pstore<std::complex<double> >(to, from);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
|
||||
@@ -316,24 +384,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, con
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
||||
{
|
||||
Packet2d a_re, a_im, v1, v2;
|
||||
|
||||
// Permute and multiply the real parts of a and b
|
||||
a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
|
||||
// Get the imaginary parts of a
|
||||
a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
|
||||
// multiply a_re * b
|
||||
v1 = vec_madd(a_re, b.v, p2d_ZERO);
|
||||
// multiply a_im * b and get the conjugate result
|
||||
v2 = vec_madd(a_im, b.v, p2d_ZERO);
|
||||
v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
|
||||
v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
|
||||
|
||||
return Packet1cd(padd<Packet2d>(v1, v2));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
|
||||
@@ -345,7 +395,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
|
||||
{
|
||||
std::complex<double> EIGEN_ALIGN16 res[2];
|
||||
EIGEN_ALIGN16 std::complex<double> res[2];
|
||||
pstore<std::complex<double> >(res, a);
|
||||
|
||||
return res[0];
|
||||
|
||||
@@ -22,10 +22,6 @@ namespace internal {
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
||||
#endif
|
||||
|
||||
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
|
||||
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
|
||||
@@ -40,9 +36,8 @@ typedef __vector unsigned char Packet16uc;
|
||||
|
||||
// We don't want to write the same code all the time, but we need to reuse the constants
|
||||
// and it doesn't really work to declare them global, so we define macros instead
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
|
||||
Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
|
||||
Packet4f p4f_##NAME = {X, X, X, X}
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
|
||||
Packet4i p4i_##NAME = vec_splat_s32(X)
|
||||
@@ -64,7 +59,7 @@ typedef __vector unsigned char Packet16uc;
|
||||
|
||||
#define DST_CHAN 1
|
||||
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
|
||||
|
||||
#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
|
||||
|
||||
// These constants are endian-agnostic
|
||||
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
|
||||
@@ -77,21 +72,15 @@ static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)
|
||||
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
|
||||
#endif
|
||||
|
||||
static Packet4ui p4ui_SIGN = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u};
|
||||
static Packet4ui p4ui_PREV0DOT5 = {0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu};
|
||||
|
||||
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
|
||||
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
|
||||
|
||||
static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
|
||||
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
|
||||
|
||||
// Mask alignment
|
||||
#ifdef __PPC64__
|
||||
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
|
||||
#else
|
||||
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
|
||||
#endif
|
||||
|
||||
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
|
||||
|
||||
// Handle endianness properly while loading constants
|
||||
// Define global static constants:
|
||||
#ifdef _BIG_ENDIAN
|
||||
@@ -235,112 +224,127 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
|
||||
return s;
|
||||
}
|
||||
|
||||
template <typename Packet, typename Scalar>
|
||||
EIGEN_STRONG_INLINE Packet pload_common(const Scalar* from)
|
||||
{
|
||||
// some versions of GCC throw "unused-but-set-parameter".
|
||||
// ignoring these warnings for now.
|
||||
EIGEN_UNUSED_VARIABLE(from);
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
return vec_ld(0, from);
|
||||
}
|
||||
|
||||
// Need to define them first or we get specialization after instantiation errors
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
#ifdef __VSX__
|
||||
return vec_vsx_ld(0, from);
|
||||
#else
|
||||
return vec_ld(0, from);
|
||||
#endif
|
||||
return pload_common<Packet4f, float>(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
return pload_common<Packet4i, int>(from);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
|
||||
// some versions of GCC throw "unused-but-set-parameter" (float *to).
|
||||
// ignoring these warnings for now.
|
||||
EIGEN_UNUSED_VARIABLE(to);
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
#ifdef __VSX__
|
||||
return vec_vsx_ld(0, from);
|
||||
vec_xst(from, 0, to);
|
||||
#else
|
||||
return vec_ld(0, from);
|
||||
vec_st(from, 0, to);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
#ifdef __VSX__
|
||||
vec_vsx_st(from, 0, to);
|
||||
#else
|
||||
vec_st(from, 0, to);
|
||||
#endif
|
||||
pstore_common<Packet4f>(to, from);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
#ifdef __VSX__
|
||||
vec_vsx_st(from, 0, to);
|
||||
#else
|
||||
vec_st(from, 0, to);
|
||||
#endif
|
||||
pstore_common<Packet4i>(to, from);
|
||||
}
|
||||
|
||||
template<typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
|
||||
{
|
||||
Packet v = {from, from, from, from};
|
||||
return v;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
|
||||
Packet4f v = {from, from, from, from};
|
||||
return v;
|
||||
return pset1_size4<Packet4f>(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
|
||||
Packet4i v = {from, from, from, from};
|
||||
return v;
|
||||
return pset1_size4<Packet4i>(from);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE void
|
||||
pbroadcast4<Packet4f>(const float *a,
|
||||
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
||||
|
||||
template<typename Packet> EIGEN_STRONG_INLINE void
|
||||
pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
|
||||
Packet& a0, Packet& a1, Packet& a2, Packet& a3)
|
||||
{
|
||||
a3 = pload<Packet4f>(a);
|
||||
a3 = pload<Packet>(a);
|
||||
a0 = vec_splat(a3, 0);
|
||||
a1 = vec_splat(a3, 1);
|
||||
a2 = vec_splat(a3, 2);
|
||||
a3 = vec_splat(a3, 3);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void
|
||||
pbroadcast4<Packet4f>(const float *a,
|
||||
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
||||
{
|
||||
pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE void
|
||||
pbroadcast4<Packet4i>(const int *a,
|
||||
Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
|
||||
{
|
||||
a3 = pload<Packet4i>(a);
|
||||
a0 = vec_splat(a3, 0);
|
||||
a1 = vec_splat(a3, 1);
|
||||
a2 = vec_splat(a3, 2);
|
||||
a3 = vec_splat(a3, 3);
|
||||
pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
|
||||
}
|
||||
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
|
||||
a[0] = from[0*stride];
|
||||
a[1] = from[1*stride];
|
||||
a[2] = from[2*stride];
|
||||
a[3] = from[3*stride];
|
||||
return pload<Packet>(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
||||
{
|
||||
float EIGEN_ALIGN16 af[4];
|
||||
af[0] = from[0*stride];
|
||||
af[1] = from[1*stride];
|
||||
af[2] = from[2*stride];
|
||||
af[3] = from[3*stride];
|
||||
return pload<Packet4f>(af);
|
||||
return pgather_common<Packet4f>(from, stride);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
|
||||
{
|
||||
int EIGEN_ALIGN16 ai[4];
|
||||
ai[0] = from[0*stride];
|
||||
ai[1] = from[1*stride];
|
||||
ai[2] = from[2*stride];
|
||||
ai[3] = from[3*stride];
|
||||
return pload<Packet4i>(ai);
|
||||
return pgather_common<Packet4i>(from, stride);
|
||||
}
|
||||
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
|
||||
{
|
||||
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
|
||||
pstore<__UNPACK_TYPE__(Packet)>(a, from);
|
||||
to[0*stride] = a[0];
|
||||
to[1*stride] = a[1];
|
||||
to[2*stride] = a[2];
|
||||
to[3*stride] = a[3];
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
||||
{
|
||||
float EIGEN_ALIGN16 af[4];
|
||||
pstore<float>(af, from);
|
||||
to[0*stride] = af[0];
|
||||
to[1*stride] = af[1];
|
||||
to[2*stride] = af[2];
|
||||
to[3*stride] = af[3];
|
||||
pscatter_size4<Packet4f>(to, from, stride);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
|
||||
{
|
||||
int EIGEN_ALIGN16 ai[4];
|
||||
pstore<int>((int *)ai, from);
|
||||
to[0*stride] = ai[0];
|
||||
to[1*stride] = ai[1];
|
||||
to[2*stride] = ai[2];
|
||||
to[3*stride] = ai[3];
|
||||
pscatter_size4<Packet4i>(to, from, stride);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
|
||||
@@ -424,66 +428,67 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
|
||||
{
|
||||
Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
|
||||
Packet4f res;
|
||||
|
||||
#ifdef __VSX__
|
||||
__asm__("xvrspiz %x0, %x1\n\t"
|
||||
: "=&wa" (res)
|
||||
: "wa" (t));
|
||||
#else
|
||||
__asm__("vrfiz %0, %1\n\t"
|
||||
: "=v" (res)
|
||||
: "v" (t));
|
||||
#endif
|
||||
|
||||
return res;
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
|
||||
|
||||
#ifdef _BIG_ENDIAN
|
||||
template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
Packet16uc mask = vec_lvsl(0, from); // create the permute mask
|
||||
Packet16uc MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
|
||||
Packet16uc LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
|
||||
//TODO: Add static_cast here
|
||||
return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
Packet16uc MSQ, LSQ;
|
||||
Packet16uc mask;
|
||||
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
|
||||
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
|
||||
mask = vec_lvsl(0, from); // create the permute mask
|
||||
return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data
|
||||
|
||||
return ploadu_common<Packet4f>(from);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
|
||||
Packet16uc MSQ, LSQ;
|
||||
Packet16uc mask;
|
||||
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
|
||||
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
|
||||
mask = vec_lvsl(0, from); // create the permute mask
|
||||
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
|
||||
return ploadu_common<Packet4i>(from);
|
||||
}
|
||||
#else
|
||||
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
|
||||
{
|
||||
Packet p;
|
||||
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
|
||||
else p = ploadu<Packet>(from);
|
||||
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
||||
{
|
||||
Packet4f p;
|
||||
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
|
||||
else p = ploadu<Packet4f>(from);
|
||||
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
||||
return ploaddup_common<Packet4f>(from);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
|
||||
{
|
||||
Packet4i p;
|
||||
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
|
||||
else p = ploadu<Packet4i>(from);
|
||||
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
||||
return ploaddup_common<Packet4i>(from);
|
||||
}
|
||||
|
||||
#ifdef _BIG_ENDIAN
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
|
||||
template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_STORE
|
||||
#ifdef __VSX__
|
||||
vec_xst(from, 0, to);
|
||||
#else
|
||||
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
|
||||
// Warning: not thread safe!
|
||||
Packet16uc MSQ, LSQ, edges;
|
||||
@@ -497,45 +502,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& f
|
||||
MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
|
||||
LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
|
||||
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
|
||||
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
|
||||
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
|
||||
{
|
||||
pstoreu_common<Packet4f>(to, from);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_STORE
|
||||
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
|
||||
// Warning: not thread safe!
|
||||
Packet16uc MSQ, LSQ, edges;
|
||||
Packet16uc edgeAlign, align;
|
||||
|
||||
MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
|
||||
LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
|
||||
edgeAlign = vec_lvsl(0, to); // permute map to extract edges
|
||||
edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
|
||||
align = vec_lvsr( 0, to ); // permute map to misalign data
|
||||
MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ)
|
||||
LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ)
|
||||
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
|
||||
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
|
||||
pstoreu_common<Packet4i>(to, from);
|
||||
}
|
||||
#else
|
||||
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
|
||||
}
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
|
||||
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
|
||||
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
|
||||
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
|
||||
{
|
||||
@@ -643,37 +626,42 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
|
||||
}
|
||||
|
||||
// min
|
||||
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
|
||||
template<typename Packet> EIGEN_STRONG_INLINE
|
||||
__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
|
||||
{
|
||||
Packet4f b, res;
|
||||
Packet b, res;
|
||||
b = vec_min(a, vec_sld(a, a, 8));
|
||||
res = vec_min(b, vec_sld(b, b, 4));
|
||||
return pfirst(res);
|
||||
}
|
||||
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
|
||||
{
|
||||
return predux_min4<Packet4f>(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
|
||||
{
|
||||
Packet4i b, res;
|
||||
b = vec_min(a, vec_sld(a, a, 8));
|
||||
res = vec_min(b, vec_sld(b, b, 4));
|
||||
return pfirst(res);
|
||||
return predux_min4<Packet4i>(a);
|
||||
}
|
||||
|
||||
// max
|
||||
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
|
||||
template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
|
||||
{
|
||||
Packet4f b, res;
|
||||
Packet b, res;
|
||||
b = vec_max(a, vec_sld(a, a, 8));
|
||||
res = vec_max(b, vec_sld(b, b, 4));
|
||||
return pfirst(res);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
|
||||
{
|
||||
return predux_max4<Packet4f>(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
|
||||
{
|
||||
Packet4i b, res;
|
||||
b = vec_max(a, vec_sld(a, a, 8));
|
||||
res = vec_max(b, vec_sld(b, b, 4));
|
||||
return pfirst(res);
|
||||
return predux_max4<Packet4i>(a);
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
@@ -730,9 +718,9 @@ struct palign_impl<Offset,Packet4i>
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
||||
Packet4f t0, t1, t2, t3;
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
|
||||
T t0, t1, t2, t3;
|
||||
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
||||
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
||||
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
|
||||
@@ -743,29 +731,23 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
||||
kernel.packet[3] = vec_mergel(t1, t3);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
||||
Packet4i t0, t1, t2, t3;
|
||||
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
||||
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
||||
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
|
||||
t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
|
||||
kernel.packet[0] = vec_mergeh(t0, t2);
|
||||
kernel.packet[1] = vec_mergel(t0, t2);
|
||||
kernel.packet[2] = vec_mergeh(t1, t3);
|
||||
kernel.packet[3] = vec_mergel(t1, t3);
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
|
||||
|
||||
template<typename Packet> EIGEN_STRONG_INLINE
|
||||
Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
|
||||
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
|
||||
Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
|
||||
return vec_sel(elsePacket, thenPacket, mask);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
|
||||
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
|
||||
Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
|
||||
return vec_sel(elsePacket, thenPacket, mask);
|
||||
return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
|
||||
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
|
||||
Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
|
||||
return vec_sel(elsePacket, thenPacket, mask);
|
||||
return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
|
||||
}
|
||||
|
||||
|
||||
@@ -785,6 +767,8 @@ static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
|
||||
static Packet2d p2d_ONE = { 1.0, 1.0 };
|
||||
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
|
||||
static Packet2d p2d_MZERO = { -0.0, -0.0 };
|
||||
static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
|
||||
static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
|
||||
|
||||
#ifdef _BIG_ENDIAN
|
||||
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
|
||||
@@ -792,16 +776,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_c
|
||||
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
|
||||
#endif
|
||||
|
||||
template<int index> Packet2d vec_splat_dbl(Packet2d& a);
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
|
||||
template<int index> Packet2d vec_splat_dbl(Packet2d& a)
|
||||
{
|
||||
return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
|
||||
{
|
||||
return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
|
||||
return vec_splat(a, index);
|
||||
}
|
||||
|
||||
template<> struct packet_traits<double> : default_packet_traits
|
||||
@@ -826,7 +803,11 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
HasLog = 0,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
#if !EIGEN_COMP_CLANG
|
||||
HasRsqrt = 1,
|
||||
#else
|
||||
HasRsqrt = 0,
|
||||
#endif
|
||||
HasRound = 1,
|
||||
HasFloor = 1,
|
||||
HasCeil = 1,
|
||||
@@ -863,21 +844,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
#ifdef __VSX__
|
||||
return vec_vsx_ld(0, from);
|
||||
#else
|
||||
return vec_ld(0, from);
|
||||
#endif
|
||||
return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
#ifdef __VSX__
|
||||
vec_vsx_st(from, 0, to);
|
||||
#else
|
||||
vec_st(from, 0, to);
|
||||
#endif
|
||||
vec_xst(from, 0, to);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
||||
@@ -889,24 +862,23 @@ template<> EIGEN_STRONG_INLINE void
|
||||
pbroadcast4<Packet2d>(const double *a,
|
||||
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
|
||||
{
|
||||
a1 = pload<Packet2d>(a);
|
||||
a0 = vec_splat_dbl<0>(a1);
|
||||
a1 = vec_splat_dbl<1>(a1);
|
||||
a3 = pload<Packet2d>(a+2);
|
||||
a2 = vec_splat_dbl<0>(a3);
|
||||
a3 = vec_splat_dbl<1>(a3);
|
||||
//This way is faster than vec_splat (at least for doubles in Power 9)
|
||||
a0 = pset1<Packet2d>(a[0]);
|
||||
a1 = pset1<Packet2d>(a[1]);
|
||||
a2 = pset1<Packet2d>(a[2]);
|
||||
a3 = pset1<Packet2d>(a[3]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
|
||||
{
|
||||
double EIGEN_ALIGN16 af[2];
|
||||
EIGEN_ALIGN16 double af[2];
|
||||
af[0] = from[0*stride];
|
||||
af[1] = from[1*stride];
|
||||
return pload<Packet2d>(af);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
|
||||
{
|
||||
double EIGEN_ALIGN16 af[2];
|
||||
EIGEN_ALIGN16 double af[2];
|
||||
pstore<double>(af, from);
|
||||
to[0*stride] = af[0];
|
||||
to[1*stride] = af[1];
|
||||
@@ -918,7 +890,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
|
||||
{
|
||||
#ifdef __POWER8_VECTOR__
|
||||
return vec_neg(a);
|
||||
#else
|
||||
return vec_xor(a, p2d_MZERO);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
|
||||
|
||||
@@ -950,14 +929,24 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
|
||||
{
|
||||
Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
|
||||
Packet2d res;
|
||||
|
||||
__asm__("xvrdpiz %x0, %x1\n\t"
|
||||
: "=&wa" (res)
|
||||
: "wa" (t));
|
||||
|
||||
return res;
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_LOAD
|
||||
return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
return vec_xl(0, const_cast<double*>(from));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
|
||||
@@ -970,13 +959,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
|
||||
EIGEN_DEBUG_UNALIGNED_STORE
|
||||
vec_xst(from, 0, to);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
|
||||
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
|
||||
{
|
||||
|
||||
@@ -16,7 +16,7 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
|
||||
// Many std::complex methods such as operator+, operator-, operator* and
|
||||
// operator/ are not constexpr. Due to this, clang does not treat them as device
|
||||
@@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
|
||||
// Product
|
||||
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
|
||||
enum {
|
||||
Vectorizable = packet_traits<std::complex<T>>::HasMul
|
||||
Vectorizable = packet_traits<std::complex<T> >::HasMul
|
||||
};
|
||||
typedef typename std::complex<T> result_type;
|
||||
|
||||
@@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
|
||||
// Quotient
|
||||
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
|
||||
enum {
|
||||
Vectorizable = packet_traits<std::complex<T>>::HasDiv
|
||||
Vectorizable = packet_traits<std::complex<T> >::HasDiv
|
||||
};
|
||||
typedef typename std::complex<T> result_type;
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@
|
||||
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
|
||||
#endif
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@@ -56,7 +57,7 @@ struct __half_raw {
|
||||
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||
unsigned short x;
|
||||
};
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#elif EIGEN_CUDA_SDK_VER < 90000
|
||||
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
|
||||
typedef __half __half_raw;
|
||||
#endif
|
||||
@@ -69,7 +70,7 @@ struct half_base : public __half_raw {
|
||||
EIGEN_DEVICE_FUNC half_base() {}
|
||||
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
|
||||
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
|
||||
#endif
|
||||
};
|
||||
@@ -78,7 +79,7 @@ struct half_base : public __half_raw {
|
||||
|
||||
// Class definition.
|
||||
struct half : public half_impl::half_base {
|
||||
#if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
|
||||
#if !defined(EIGEN_HAS_CUDA_FP16) || (EIGEN_CUDA_SDK_VER < 90000)
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
|
||||
@@ -86,7 +87,7 @@ struct half : public half_impl::half_base {
|
||||
|
||||
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
|
||||
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#endif
|
||||
|
||||
@@ -208,56 +209,56 @@ namespace half_impl {
|
||||
// versions to get the ALU speed increased), but you do save the
|
||||
// conversion steps back and forth.
|
||||
|
||||
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
|
||||
return __hadd(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
|
||||
return __hadd(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
|
||||
return __hmul(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
|
||||
return __hmul(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
|
||||
return __hsub(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
|
||||
return __hsub(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
|
||||
float num = __half2float(a);
|
||||
float denom = __half2float(b);
|
||||
return __float2half(num / denom);
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
|
||||
return __hneg(a);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
|
||||
return __hneg(static_cast<__half>(a));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
|
||||
a = a + b;
|
||||
return a;
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
|
||||
a = a * b;
|
||||
return a;
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
|
||||
a = a - b;
|
||||
return a;
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
|
||||
a = a / b;
|
||||
return a;
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
|
||||
return __heq(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
|
||||
return __heq(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
|
||||
return __hne(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
|
||||
return __hne(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
|
||||
return __hlt(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
|
||||
return __hlt(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
|
||||
return __hle(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
|
||||
return __hle(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
|
||||
return __hgt(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
|
||||
return __hgt(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
|
||||
return __hge(a, b);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
|
||||
return __hge(static_cast<__half>(a), static_cast<__half>(b));
|
||||
}
|
||||
|
||||
#else // Emulate support for half floats
|
||||
@@ -448,14 +449,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
|
||||
return result;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
return half(hexp(a));
|
||||
#else
|
||||
return half(::expf(float(a)));
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return half(::hlog(a));
|
||||
#else
|
||||
return half(::logf(float(a)));
|
||||
@@ -468,7 +469,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
|
||||
return half(::log10f(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
return half(hsqrt(a));
|
||||
#else
|
||||
return half(::sqrtf(float(a)));
|
||||
@@ -490,14 +491,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
|
||||
return half(::tanhf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
return half(hfloor(a));
|
||||
#else
|
||||
return half(::floorf(float(a)));
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
return half(hceil(a));
|
||||
#else
|
||||
return half(::ceilf(float(a)));
|
||||
@@ -540,7 +541,7 @@ struct random_default_impl<half, false, false>
|
||||
{
|
||||
static inline half run(const half& x, const half& y)
|
||||
{
|
||||
return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
|
||||
return x + (y-x) * half(random<float>());
|
||||
}
|
||||
static inline half run()
|
||||
{
|
||||
@@ -592,7 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
|
||||
return Eigen::half(::expf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
#if EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return Eigen::half(::hlog(a));
|
||||
#else
|
||||
return Eigen::half(::logf(float(a)));
|
||||
@@ -625,25 +626,71 @@ struct hash<Eigen::half> {
|
||||
} // end namespace std
|
||||
|
||||
|
||||
// Add the missing shfl_xor intrinsic
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||
#if EIGEN_CUDACC_VER < 90000
|
||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||
#else
|
||||
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
|
||||
#endif
|
||||
// Add the missing shfl* intrinsics.
|
||||
// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
|
||||
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
|
||||
//
|
||||
// HIP and CUDA prior to SDK 9.0 define
|
||||
// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
|
||||
// CUDA since 9.0 deprecates those and instead defines
|
||||
// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
|
||||
// with native support for __half and __nv_bfloat16
|
||||
//
|
||||
// Note that the following are __device__ - only functions.
|
||||
#if defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
|
||||
int width = warpSize) {
|
||||
const __half h = var;
|
||||
return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
|
||||
}
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta,
|
||||
int width = warpSize) {
|
||||
const __half h = var;
|
||||
return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
|
||||
}
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta,
|
||||
int width = warpSize) {
|
||||
const __half h = var;
|
||||
return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
|
||||
}
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask,
|
||||
int width = warpSize) {
|
||||
const __half h = var;
|
||||
return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
|
||||
}
|
||||
|
||||
#else // CUDA SDK < 9.0
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
|
||||
return static_cast<Eigen::half>(__shfl(static_cast<float>(var), srcLane, width));
|
||||
}
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
|
||||
return static_cast<Eigen::half>(__shfl_up(static_cast<float>(var), delta, width));
|
||||
}
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
|
||||
return static_cast<Eigen::half>(__shfl_down(static_cast<float>(var), delta, width));
|
||||
}
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
|
||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // __shfl*
|
||||
|
||||
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||
return Eigen::half_impl::raw_uint16_to_half(
|
||||
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
||||
#if defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)
|
||||
EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
|
||||
return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __ldg
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
namespace Eigen {
|
||||
|
||||
@@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 plog<float4>(const float4& a)
|
||||
{
|
||||
|
||||
@@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
||||
|
||||
@@ -167,10 +167,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
|
||||
return make_double2(from[0], from[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
|
||||
return make_float4(from[0], from[0], from[1], from[1]);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
|
||||
return make_double2(from[0], from[0]);
|
||||
}
|
||||
|
||||
@@ -197,7 +197,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
return __ldg((const float4*)from);
|
||||
return __ldg(reinterpret_cast<const float4*>(from));
|
||||
#else
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
#endif
|
||||
@@ -205,7 +205,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
return __ldg((const double2*)from);
|
||||
return __ldg(reinterpret_cast<const double2*>(from));
|
||||
#else
|
||||
return make_double2(from[0], from[1]);
|
||||
#endif
|
||||
|
||||
@@ -15,7 +15,7 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Most of the following operations require arch >= 3.0
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
|
||||
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
||||
|
||||
@@ -41,42 +41,42 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
|
||||
|
||||
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||
return __half2half2(from);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
|
||||
return *reinterpret_cast<const half2*>(from);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
|
||||
return __halves2half2(from[0], from[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
|
||||
return __halves2half2(from[0], from[0]);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
*reinterpret_cast<half2*>(to) = from;
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
to[0] = __low2half(from);
|
||||
to[1] = __high2half(from);
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
return __ldg((const half2*)from);
|
||||
return __ldg(reinterpret_cast<const half2*>(from));
|
||||
#else
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
#else
|
||||
@@ -84,20 +84,20 @@ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::ha
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
|
||||
return __halves2half2(from[0*stride], from[1*stride]);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
|
||||
to[stride*0] = __low2half(from);
|
||||
to[stride*1] = __high2half(from);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
|
||||
return __low2half(a);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
|
||||
half2 result;
|
||||
unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
|
||||
*(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
|
||||
@@ -105,7 +105,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
|
||||
}
|
||||
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE void
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
||||
ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
__half a1 = __low2half(kernel.packet[0]);
|
||||
__half a2 = __high2half(kernel.packet[0]);
|
||||
@@ -115,7 +115,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
kernel.packet[1] = __halves2half2(a2, b2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||
#else
|
||||
@@ -124,7 +124,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hadd2(a, b);
|
||||
#else
|
||||
@@ -138,7 +138,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hsub2(a, b);
|
||||
#else
|
||||
@@ -152,7 +152,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hneg2(a);
|
||||
#else
|
||||
@@ -162,9 +162,9 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hmul2(a, b);
|
||||
#else
|
||||
@@ -178,7 +178,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hfma2(a, b, c);
|
||||
#else
|
||||
@@ -194,7 +194,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
@@ -204,7 +204,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
@@ -214,7 +214,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
@@ -224,17 +224,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hadd(__low2half(a), __high2half(a));
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
|
||||
return Eigen::half(__float2half_rn(a1 + a2));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
@@ -246,7 +246,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
@@ -258,17 +258,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hmul(__low2half(a), __high2half(a));
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
|
||||
return Eigen::half(__float2half_rn(a1 * a2));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = log1pf(a1);
|
||||
@@ -276,31 +276,31 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
#if EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
half2 plog<half2>(const half2& a) {
|
||||
return h2log(a);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
half2 pexp<half2>(const half2& a) {
|
||||
return h2exp(a);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
half2 psqrt<half2>(const half2& a) {
|
||||
return h2sqrt(a);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
half2 prsqrt<half2>(const half2& a) {
|
||||
return h2rsqrt(a);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = logf(a1);
|
||||
@@ -308,7 +308,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = expf(a1);
|
||||
@@ -316,7 +316,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = sqrtf(a1);
|
||||
@@ -324,7 +324,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = rsqrtf(a1);
|
||||
|
||||
1662
Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
Normal file
1662
Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
Normal file
File diff suppressed because it is too large
Load Diff
@@ -32,7 +32,7 @@ namespace internal {
|
||||
#if EIGEN_ARCH_ARM64
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
|
||||
#else
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -107,7 +107,7 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
AlignedOnScalar = 1,
|
||||
size = 4,
|
||||
HasHalfPacket=0, // Packet2f intrinsics not implemented yet
|
||||
|
||||
|
||||
HasDiv = 1,
|
||||
// FIXME check the Has*
|
||||
HasSin = 0,
|
||||
@@ -173,32 +173,48 @@ template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||
{
|
||||
#if EIGEN_ARCH_ARM64
|
||||
return vdivq_f32(a,b);
|
||||
#else
|
||||
Packet4f inv, restep, div;
|
||||
|
||||
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
|
||||
// However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
|
||||
// a reciprocal estimate AND a reciprocal step -which saves a few instructions
|
||||
// vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
|
||||
// Newton-Raphson and vrecpsq_f32()
|
||||
inv = vrecpeq_f32(b);
|
||||
|
||||
// This returns a differential, by which we will have to multiply inv to get a better
|
||||
// approximation of 1/b.
|
||||
restep = vrecpsq_f32(b, inv);
|
||||
inv = vmulq_f32(restep, inv);
|
||||
|
||||
// Finally, multiply a by 1/b and get the wanted result of the division.
|
||||
div = vmulq_f32(a, inv);
|
||||
|
||||
return div;
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
|
||||
return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
|
||||
return vreinterpretq_f32_u32(vcleq_f32(a, b));
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a)
|
||||
{
|
||||
// Compute approximate reciprocal.
|
||||
float32x4_t result = vrecpeq_f32(a);
|
||||
result = vmulq_f32(vrecpsq_f32(a, result), result);
|
||||
result = vmulq_f32(vrecpsq_f32(a, result), result);
|
||||
return result;
|
||||
}
|
||||
|
||||
#if EIGEN_ARCH_ARM64
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
|
||||
#else
|
||||
template<typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
|
||||
// if b is large, NEON intrinsics will flush preciprocal(b) to zero
|
||||
// avoid underflow with the following manipulation:
|
||||
// a / b = f * (a * reciprocal(f * b))
|
||||
const Packet cst_one = pset1<Packet>(1.0f);
|
||||
const Packet cst_quarter = pset1<Packet>(0.25f);
|
||||
const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
|
||||
|
||||
Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
|
||||
Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
|
||||
Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
||||
return pdiv_float_common(a, b);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
|
||||
{ eigen_assert(false && "packet integer division are not supported by NEON");
|
||||
return pset1<Packet4i>(0);
|
||||
@@ -208,7 +224,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
|
||||
// then implements a slow software scalar fallback calling fmaf()!
|
||||
// Filed LLVM bug:
|
||||
// https://llvm.org/bugs/show_bug.cgi?id=27216
|
||||
#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
|
||||
#if (defined EIGEN_VECTORIZE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
|
||||
// See bug 936.
|
||||
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
|
||||
// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
|
||||
@@ -478,7 +494,7 @@ template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
|
||||
a_hi = vget_high_s32(a);
|
||||
min = vpmin_s32(a_lo, a_hi);
|
||||
min = vpmin_s32(min, min);
|
||||
|
||||
|
||||
return vget_lane_s32(min, 0);
|
||||
}
|
||||
|
||||
@@ -595,7 +611,7 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
AlignedOnScalar = 1,
|
||||
size = 2,
|
||||
HasHalfPacket=0,
|
||||
|
||||
|
||||
HasDiv = 1,
|
||||
// FIXME check the Has*
|
||||
HasSin = 0,
|
||||
@@ -628,7 +644,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
|
||||
|
||||
#ifdef __ARM_FEATURE_FMA
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
// See bug 936. See above comment about FMA for float.
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
|
||||
#else
|
||||
@@ -751,7 +767,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
|
||||
kernel.packet[0] = trn1;
|
||||
kernel.packet[1] = trn2;
|
||||
}
|
||||
#endif // EIGEN_ARCH_ARM64
|
||||
#endif // EIGEN_ARCH_ARM64
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
@@ -10,8 +10,6 @@
|
||||
#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
|
||||
#define EIGEN_PACKET_MATH_ZVECTOR_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
@@ -46,10 +44,10 @@ typedef struct {
|
||||
} Packet4f;
|
||||
|
||||
typedef union {
|
||||
int32_t i[4];
|
||||
uint32_t ui[4];
|
||||
int64_t l[2];
|
||||
uint64_t ul[2];
|
||||
numext::int32_t i[4];
|
||||
numext::uint32_t ui[4];
|
||||
numext::int64_t l[2];
|
||||
numext::uint64_t ul[2];
|
||||
double d[2];
|
||||
Packet4i v4i;
|
||||
Packet4ui v4ui;
|
||||
|
||||
@@ -768,7 +768,7 @@ struct scalar_sign_op<Scalar,true> {
|
||||
if (aa==real_type(0))
|
||||
return Scalar(0);
|
||||
aa = real_type(1)/aa;
|
||||
return Scalar(real(a)*aa, imag(a)*aa );
|
||||
return Scalar(a.real()*aa, a.imag()*aa );
|
||||
}
|
||||
//TODO
|
||||
//template <typename Packet>
|
||||
|
||||
@@ -115,7 +115,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
// registers. However once the latency is hidden there is no point in
|
||||
// increasing the value of k, so we'll cap it at 320 (value determined
|
||||
// experimentally).
|
||||
const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
|
||||
// To avoid that k vanishes, we make k_cache at least as big as kr
|
||||
const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
|
||||
if (k_cache < k) {
|
||||
k = k_cache - (k_cache % kr);
|
||||
eigen_internal_assert(k > 0);
|
||||
@@ -648,8 +649,8 @@ public:
|
||||
// Vectorized path
|
||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
|
||||
{
|
||||
dest.first = pset1<RealPacket>(real(*b));
|
||||
dest.second = pset1<RealPacket>(imag(*b));
|
||||
dest.first = pset1<RealPacket>(numext::real(*b));
|
||||
dest.second = pset1<RealPacket>(numext::imag(*b));
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
|
||||
|
||||
@@ -20,8 +20,9 @@ template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
|
||||
template<
|
||||
typename Index,
|
||||
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
|
||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride>
|
||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride>
|
||||
{
|
||||
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
|
||||
|
||||
@@ -30,7 +31,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
|
||||
Index rows, Index cols, Index depth,
|
||||
const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsStride,
|
||||
ResScalar* res, Index resStride,
|
||||
ResScalar* res, Index resIncr, Index resStride,
|
||||
ResScalar alpha,
|
||||
level3_blocking<RhsScalar,LhsScalar>& blocking,
|
||||
GemmParallelInfo<Index>* info = 0)
|
||||
@@ -39,8 +40,8 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
|
||||
general_matrix_matrix_product<Index,
|
||||
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
|
||||
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
|
||||
ColMajor>
|
||||
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
|
||||
ColMajor,ResInnerStride>
|
||||
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -49,8 +50,9 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
|
||||
template<
|
||||
typename Index,
|
||||
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
|
||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride>
|
||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride>
|
||||
{
|
||||
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
@@ -59,17 +61,17 @@ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScala
|
||||
static void run(Index rows, Index cols, Index depth,
|
||||
const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsStride,
|
||||
ResScalar* _res, Index resStride,
|
||||
ResScalar* _res, Index resIncr, Index resStride,
|
||||
ResScalar alpha,
|
||||
level3_blocking<LhsScalar,RhsScalar>& blocking,
|
||||
GemmParallelInfo<Index>* info = 0)
|
||||
{
|
||||
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
|
||||
LhsMapper lhs(_lhs, lhsStride);
|
||||
RhsMapper rhs(_rhs, rhsStride);
|
||||
ResMapper res(_res, resStride, resIncr);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@@ -226,7 +228,7 @@ struct gemm_functor
|
||||
Gemm::run(rows, cols, m_lhs.cols(),
|
||||
&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
|
||||
&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
|
||||
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
|
||||
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(),
|
||||
m_actualAlpha, m_blocking, info);
|
||||
}
|
||||
|
||||
@@ -428,7 +430,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
||||
lazyproduct::evalTo(dst, lhs, rhs);
|
||||
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
|
||||
else
|
||||
{
|
||||
dst.setZero();
|
||||
@@ -440,7 +442,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
||||
lazyproduct::addTo(dst, lhs, rhs);
|
||||
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
|
||||
else
|
||||
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
|
||||
}
|
||||
@@ -449,7 +451,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
||||
lazyproduct::subTo(dst, lhs, rhs);
|
||||
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
|
||||
else
|
||||
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
|
||||
}
|
||||
@@ -476,7 +478,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
Index,
|
||||
LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
|
||||
RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
|
||||
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
|
||||
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,
|
||||
Dest::InnerStrideAtCompileTime>,
|
||||
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
|
||||
|
||||
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
|
||||
#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
|
||||
struct selfadjoint_rank1_update;
|
||||
@@ -25,51 +25,54 @@ namespace internal {
|
||||
**********************************************************************/
|
||||
|
||||
// forward declarations (defined at the end of this file)
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
|
||||
struct tribb_kernel;
|
||||
|
||||
|
||||
/* Optimized matrix-matrix product evaluating only one triangular half */
|
||||
template <typename Index,
|
||||
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResStorageOrder, int UpLo, int Version = Specialized>
|
||||
int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized>
|
||||
struct general_matrix_matrix_triangular_product;
|
||||
|
||||
// as usual if the result is row major => we transpose the product
|
||||
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
|
||||
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride, int UpLo, int Version>
|
||||
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version>
|
||||
{
|
||||
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
|
||||
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride,
|
||||
const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
|
||||
{
|
||||
general_matrix_matrix_triangular_product<Index,
|
||||
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
|
||||
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
|
||||
ColMajor, UpLo==Lower?Upper:Lower>
|
||||
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking);
|
||||
ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower>
|
||||
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
|
||||
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride, int UpLo, int Version>
|
||||
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
|
||||
{
|
||||
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
|
||||
const RhsScalar* _rhs, Index rhsStride,
|
||||
ResScalar* _res, Index resIncr, Index resStride,
|
||||
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
|
||||
{
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
ResMapper res(_res, resStride, resIncr);
|
||||
|
||||
Index kc = blocking.kc();
|
||||
Index mc = (std::min)(size,blocking.mc());
|
||||
@@ -87,7 +90,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
|
||||
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
|
||||
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
{
|
||||
@@ -110,8 +113,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
|
||||
(std::min)(size,i2), alpha, -1, -1, 0, 0);
|
||||
|
||||
|
||||
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
|
||||
sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
|
||||
|
||||
if (UpLo==Upper)
|
||||
{
|
||||
@@ -133,7 +135,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
// while the triangular block overlapping the diagonal is evaluated into a
|
||||
// small temporary buffer which is then accumulated into the result using a
|
||||
// triangular traversal.
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
|
||||
struct tribb_kernel
|
||||
{
|
||||
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
|
||||
@@ -142,11 +144,13 @@ struct tribb_kernel
|
||||
enum {
|
||||
BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
|
||||
};
|
||||
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
|
||||
void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
|
||||
{
|
||||
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
|
||||
ResMapper res(_res, resStride);
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
|
||||
typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
|
||||
typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
|
||||
ResMapper res(_res, resStride, resIncr);
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
|
||||
|
||||
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
|
||||
|
||||
@@ -158,31 +162,32 @@ struct tribb_kernel
|
||||
const RhsScalar* actual_b = blockB+j*depth;
|
||||
|
||||
if(UpLo==Upper)
|
||||
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
|
||||
// selfadjoint micro block
|
||||
{
|
||||
Index i = j;
|
||||
buffer.setZero();
|
||||
// 1 - apply the kernel on the temporary buffer
|
||||
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
|
||||
// 2 - triangular accumulation
|
||||
for(Index j1=0; j1<actualBlockSize; ++j1)
|
||||
{
|
||||
ResScalar* r = &res(i, j + j1);
|
||||
typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1);
|
||||
for(Index i1=UpLo==Lower ? j1 : 0;
|
||||
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
|
||||
r[i1] += buffer(i1,j1);
|
||||
r(i1) += buffer(i1,j1);
|
||||
}
|
||||
}
|
||||
|
||||
if(UpLo==Lower)
|
||||
{
|
||||
Index i = j+actualBlockSize;
|
||||
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
|
||||
depth, actualBlockSize, alpha, -1, -1, 0, 0);
|
||||
gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
|
||||
depth, actualBlockSize, alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -202,13 +207,13 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
|
||||
static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
|
||||
{
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
|
||||
|
||||
typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
|
||||
typedef internal::blas_traits<Lhs> LhsBlasTraits;
|
||||
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
|
||||
typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
|
||||
typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
|
||||
|
||||
|
||||
typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
|
||||
typedef internal::blas_traits<Rhs> RhsBlasTraits;
|
||||
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
|
||||
@@ -225,18 +230,18 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
|
||||
UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
|
||||
UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1
|
||||
};
|
||||
|
||||
|
||||
internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(),
|
||||
(UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
|
||||
if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
|
||||
|
||||
|
||||
internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(),
|
||||
(UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
|
||||
if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
|
||||
|
||||
|
||||
|
||||
|
||||
selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
|
||||
LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
|
||||
RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>
|
||||
@@ -254,7 +259,7 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
|
||||
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
|
||||
typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
|
||||
typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
|
||||
|
||||
|
||||
typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
|
||||
typedef internal::blas_traits<Rhs> RhsBlasTraits;
|
||||
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
|
||||
@@ -286,23 +291,24 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
|
||||
internal::general_matrix_matrix_triangular_product<Index,
|
||||
typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
|
||||
typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
|
||||
IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)>
|
||||
IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)>
|
||||
::run(size, depth,
|
||||
&actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
|
||||
&actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
|
||||
mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking);
|
||||
mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0),
|
||||
mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename MatrixType, unsigned int UpLo>
|
||||
template<typename _MatrixType, unsigned int _Mode>
|
||||
template<typename ProductType>
|
||||
TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
|
||||
EIGEN_DEVICE_FUNC TriangularView<_MatrixType,_Mode>& TriangularViewImpl<_MatrixType,_Mode,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
|
||||
EIGEN_STATIC_ASSERT((_Mode&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
|
||||
eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
|
||||
|
||||
general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
|
||||
|
||||
|
||||
general_product_to_triangular_selector<_MatrixType, ProductType, _Mode, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
|
||||
|
||||
return derived();
|
||||
}
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ namespace internal {
|
||||
template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
|
||||
struct general_matrix_matrix_rankupdate :
|
||||
general_matrix_matrix_triangular_product<
|
||||
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
|
||||
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
|
||||
|
||||
|
||||
// try to go to BLAS specialization
|
||||
@@ -48,9 +48,9 @@ struct general_matrix_matrix_rankupdate :
|
||||
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
|
||||
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
|
||||
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
|
||||
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
|
||||
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
|
||||
const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
|
||||
{ \
|
||||
if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
|
||||
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
|
||||
@@ -59,8 +59,8 @@ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,Con
|
||||
general_matrix_matrix_triangular_product<Index, \
|
||||
Scalar, LhsStorageOrder, ConjugateLhs, \
|
||||
Scalar, RhsStorageOrder, ConjugateRhs, \
|
||||
ColMajor, UpLo, BuiltIn> \
|
||||
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
|
||||
ColMajor, 1, UpLo, BuiltIn> \
|
||||
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \
|
||||
} \
|
||||
} \
|
||||
};
|
||||
|
||||
@@ -51,20 +51,22 @@ template< \
|
||||
typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
|
||||
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \
|
||||
{ \
|
||||
typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
|
||||
\
|
||||
static void run(Index rows, Index cols, Index depth, \
|
||||
const EIGTYPE* _lhs, Index lhsStride, \
|
||||
const EIGTYPE* _rhs, Index rhsStride, \
|
||||
EIGTYPE* res, Index resStride, \
|
||||
EIGTYPE* res, Index resIncr, Index resStride, \
|
||||
EIGTYPE alpha, \
|
||||
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
|
||||
GemmParallelInfo<Index>* /*info = 0*/) \
|
||||
{ \
|
||||
using std::conj; \
|
||||
\
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
|
||||
eigen_assert(resIncr == 1); \
|
||||
char transa, transb; \
|
||||
BlasIndex m, n, k, lda, ldb, ldc; \
|
||||
const EIGTYPE *a, *b; \
|
||||
|
||||
@@ -17,7 +17,8 @@ namespace internal {
|
||||
/** \internal */
|
||||
inline void manage_multi_threading(Action action, int* v)
|
||||
{
|
||||
static EIGEN_UNUSED int m_maxThreads = -1;
|
||||
static int m_maxThreads = -1;
|
||||
EIGEN_UNUSED_VARIABLE(m_maxThreads);
|
||||
|
||||
if(action==SetAction)
|
||||
{
|
||||
@@ -150,8 +151,10 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
|
||||
info[i].lhs_start = r0;
|
||||
info[i].lhs_length = actualBlockRows;
|
||||
|
||||
if(transpose) func(c0, actualBlockCols, 0, rows, info);
|
||||
else func(0, rows, c0, actualBlockCols, info);
|
||||
if(transpose)
|
||||
func(c0, actualBlockCols, 0, rows, info);
|
||||
else
|
||||
func(0, rows, c0, actualBlockCols, info);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -277,20 +277,21 @@ struct symm_pack_rhs
|
||||
template <typename Scalar, typename Index,
|
||||
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
|
||||
int ResStorageOrder>
|
||||
int ResStorageOrder, int ResInnerStride>
|
||||
struct product_selfadjoint_matrix;
|
||||
|
||||
template <typename Scalar, typename Index,
|
||||
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
|
||||
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
|
||||
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
|
||||
int ResInnerStride>
|
||||
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride>
|
||||
{
|
||||
|
||||
static EIGEN_STRONG_INLINE void run(
|
||||
Index rows, Index cols,
|
||||
const Scalar* lhs, Index lhsStride,
|
||||
const Scalar* rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
product_selfadjoint_matrix<Scalar, Index,
|
||||
@@ -298,33 +299,35 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
|
||||
RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
|
||||
EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
|
||||
LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
|
||||
ColMajor>
|
||||
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
|
||||
ColMajor,ResInnerStride>
|
||||
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Index,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs>
|
||||
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride>
|
||||
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>
|
||||
{
|
||||
|
||||
static EIGEN_DONT_INLINE void run(
|
||||
Index rows, Index cols,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Index,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs>
|
||||
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride>
|
||||
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
|
||||
Index rows, Index cols,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* _res, Index resStride,
|
||||
Scalar* _res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index size = rows;
|
||||
@@ -334,11 +337,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
ResMapper res(_res, resStride, resIncr);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@@ -398,26 +401,28 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
// matrix * selfadjoint product
|
||||
template <typename Scalar, typename Index,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs>
|
||||
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride>
|
||||
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>
|
||||
{
|
||||
|
||||
static EIGEN_DONT_INLINE void run(
|
||||
Index rows, Index cols,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Index,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs>
|
||||
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride>
|
||||
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
|
||||
Index rows, Index cols,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* _res, Index resStride,
|
||||
Scalar* _res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index size = cols;
|
||||
@@ -425,9 +430,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
ResMapper res(_res,resStride);
|
||||
ResMapper res(_res,resStride, resIncr);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@@ -503,12 +508,13 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
|
||||
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
|
||||
EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
|
||||
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
|
||||
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
|
||||
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor,
|
||||
Dest::InnerStrideAtCompileTime>
|
||||
::run(
|
||||
lhs.rows(), rhs.cols(), // sizes
|
||||
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
|
||||
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
|
||||
&dst.coeffRef(0,0), dst.outerStride(), // result info
|
||||
&dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
|
||||
actualAlpha, blocking // alpha
|
||||
);
|
||||
}
|
||||
|
||||
@@ -44,16 +44,18 @@ namespace internal {
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
|
||||
{\
|
||||
\
|
||||
static void run( \
|
||||
Index rows, Index cols, \
|
||||
const EIGTYPE* _lhs, Index lhsStride, \
|
||||
const EIGTYPE* _rhs, Index rhsStride, \
|
||||
EIGTYPE* res, Index resStride, \
|
||||
EIGTYPE* res, Index resIncr, Index resStride, \
|
||||
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
|
||||
{ \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
|
||||
eigen_assert(resIncr == 1); \
|
||||
char side='L', uplo='L'; \
|
||||
BlasIndex m, n, lda, ldb, ldc; \
|
||||
const EIGTYPE *a, *b; \
|
||||
@@ -91,15 +93,17 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
|
||||
{\
|
||||
static void run( \
|
||||
Index rows, Index cols, \
|
||||
const EIGTYPE* _lhs, Index lhsStride, \
|
||||
const EIGTYPE* _rhs, Index rhsStride, \
|
||||
EIGTYPE* res, Index resStride, \
|
||||
EIGTYPE* res, Index resIncr, Index resStride, \
|
||||
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
|
||||
{ \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
|
||||
eigen_assert(resIncr == 1); \
|
||||
char side='L', uplo='L'; \
|
||||
BlasIndex m, n, lda, ldb, ldc; \
|
||||
const EIGTYPE *a, *b; \
|
||||
@@ -167,16 +171,18 @@ EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
|
||||
{\
|
||||
\
|
||||
static void run( \
|
||||
Index rows, Index cols, \
|
||||
const EIGTYPE* _lhs, Index lhsStride, \
|
||||
const EIGTYPE* _rhs, Index rhsStride, \
|
||||
EIGTYPE* res, Index resStride, \
|
||||
EIGTYPE* res, Index resIncr, Index resStride, \
|
||||
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
|
||||
{ \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
|
||||
eigen_assert(resIncr == 1); \
|
||||
char side='R', uplo='L'; \
|
||||
BlasIndex m, n, lda, ldb, ldc; \
|
||||
const EIGTYPE *a, *b; \
|
||||
@@ -213,15 +219,17 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
|
||||
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
|
||||
{\
|
||||
static void run( \
|
||||
Index rows, Index cols, \
|
||||
const EIGTYPE* _lhs, Index lhsStride, \
|
||||
const EIGTYPE* _rhs, Index rhsStride, \
|
||||
EIGTYPE* res, Index resStride, \
|
||||
EIGTYPE* res, Index resIncr, Index resStride, \
|
||||
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
|
||||
{ \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
|
||||
eigen_assert(resIncr == 1); \
|
||||
char side='R', uplo='L'; \
|
||||
BlasIndex m, n, lda, ldb, ldc; \
|
||||
const EIGTYPE *a, *b; \
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
* It corresponds to the level 3 SYRK and level 2 SYR Blas routines.
|
||||
**********************************************************************/
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
|
||||
template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
|
||||
@@ -68,10 +68,10 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(),
|
||||
(UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
|
||||
|
||||
|
||||
if(!UseOtherDirectly)
|
||||
Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
|
||||
|
||||
|
||||
selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
|
||||
OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
|
||||
(!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>
|
||||
@@ -109,10 +109,10 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
|
||||
internal::general_matrix_matrix_triangular_product<Index,
|
||||
Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
|
||||
Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
|
||||
IsRowMajor ? RowMajor : ColMajor, UpLo>
|
||||
IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
|
||||
::run(size, depth,
|
||||
&actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
|
||||
mat.data(), mat.outerStride(), actualAlpha, blocking);
|
||||
mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -120,7 +120,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
|
||||
|
||||
template<typename MatrixType, unsigned int UpLo>
|
||||
template<typename DerivedU>
|
||||
SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
||||
EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
||||
::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
|
||||
{
|
||||
selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_SELFADJOINTRANK2UPTADE_H
|
||||
#define EIGEN_SELFADJOINTRANK2UPTADE_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -57,7 +57,7 @@ template<bool Cond, typename T> struct conj_expr_if
|
||||
|
||||
template<typename MatrixType, unsigned int UpLo>
|
||||
template<typename DerivedU, typename DerivedV>
|
||||
SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
||||
EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
||||
::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
|
||||
{
|
||||
typedef internal::blas_traits<DerivedU> UBlasTraits;
|
||||
|
||||
@@ -45,22 +45,24 @@ template <typename Scalar, typename Index,
|
||||
int Mode, bool LhsIsTriangular,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResStorageOrder, int Version = Specialized>
|
||||
int ResStorageOrder, int ResInnerStride,
|
||||
int Version = Specialized>
|
||||
struct product_triangular_matrix_matrix;
|
||||
|
||||
template <typename Scalar, typename Index,
|
||||
int Mode, bool LhsIsTriangular,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs, int Version>
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride, int Version>
|
||||
struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
|
||||
LhsStorageOrder,ConjugateLhs,
|
||||
RhsStorageOrder,ConjugateRhs,RowMajor,Version>
|
||||
RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(
|
||||
Index rows, Index cols, Index depth,
|
||||
const Scalar* lhs, Index lhsStride,
|
||||
const Scalar* rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
product_triangular_matrix_matrix<Scalar, Index,
|
||||
@@ -70,18 +72,19 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
|
||||
ConjugateRhs,
|
||||
LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
|
||||
ConjugateLhs,
|
||||
ColMajor>
|
||||
::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
|
||||
ColMajor, ResInnerStride>
|
||||
::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
|
||||
}
|
||||
};
|
||||
|
||||
// implements col-major += alpha * op(triangular) * op(general)
|
||||
template <typename Scalar, typename Index, int Mode,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs, int Version>
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride, int Version>
|
||||
struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
LhsStorageOrder,ConjugateLhs,
|
||||
RhsStorageOrder,ConjugateRhs,ColMajor,Version>
|
||||
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
|
||||
{
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
@@ -95,20 +98,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
Index _rows, Index _cols, Index _depth,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Index, int Mode,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs, int Version>
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride, int Version>
|
||||
EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
LhsStorageOrder,ConjugateLhs,
|
||||
RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
|
||||
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
|
||||
Index _rows, Index _cols, Index _depth,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* _res, Index resStride,
|
||||
Scalar* _res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
// strip zeros
|
||||
@@ -119,10 +123,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
ResMapper res(_res, resStride, resIncr);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@@ -235,10 +239,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
// implements col-major += alpha * op(general) * op(triangular)
|
||||
template <typename Scalar, typename Index, int Mode,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs, int Version>
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride, int Version>
|
||||
struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
LhsStorageOrder,ConjugateLhs,
|
||||
RhsStorageOrder,ConjugateRhs,ColMajor,Version>
|
||||
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
|
||||
{
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
enum {
|
||||
@@ -251,20 +256,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index _rows, Index _cols, Index _depth,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Index, int Mode,
|
||||
int LhsStorageOrder, bool ConjugateLhs,
|
||||
int RhsStorageOrder, bool ConjugateRhs, int Version>
|
||||
int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResInnerStride, int Version>
|
||||
EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
LhsStorageOrder,ConjugateLhs,
|
||||
RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
|
||||
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
|
||||
Index _rows, Index _cols, Index _depth,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* _res, Index resStride,
|
||||
Scalar* _res, Index resIncr, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
|
||||
@@ -276,10 +282,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
ResMapper res(_res, resStride, resIncr);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@@ -433,12 +439,12 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
|
||||
Mode, LhsIsTriangular,
|
||||
(internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
|
||||
(internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
|
||||
(internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
|
||||
(internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>
|
||||
::run(
|
||||
stripedRows, stripedCols, stripedDepth, // sizes
|
||||
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
|
||||
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
|
||||
&dst.coeffRef(0,0), dst.outerStride(), // result info
|
||||
&dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
|
||||
actualAlpha, blocking
|
||||
);
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ template <typename Scalar, typename Index,
|
||||
struct product_triangular_matrix_matrix_trmm :
|
||||
product_triangular_matrix_matrix<Scalar,Index,Mode,
|
||||
LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
|
||||
RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {};
|
||||
RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};
|
||||
|
||||
|
||||
// try to go to BLAS specialization
|
||||
@@ -55,13 +55,15 @@ template <typename Index, int Mode, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
|
||||
LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \
|
||||
LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \
|
||||
static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
|
||||
const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
|
||||
const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
|
||||
eigen_assert(resIncr == 1); \
|
||||
product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
|
||||
LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
|
||||
RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
|
||||
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
|
||||
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
|
||||
} \
|
||||
};
|
||||
|
||||
@@ -115,8 +117,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
|
||||
if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
|
||||
/* Most likely no benefit to call TRMM or GEMM from BLAS */ \
|
||||
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
|
||||
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
|
||||
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
|
||||
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
|
||||
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
|
||||
/*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
|
||||
} else { \
|
||||
/* Make sense to call GEMM */ \
|
||||
@@ -124,8 +126,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
|
||||
MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
|
||||
BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
|
||||
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
|
||||
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
|
||||
rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
|
||||
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
|
||||
rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \
|
||||
\
|
||||
/*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
|
||||
} \
|
||||
@@ -232,8 +234,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
|
||||
if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
|
||||
/* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
|
||||
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
|
||||
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
|
||||
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
|
||||
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
|
||||
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
|
||||
/*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
|
||||
} else { \
|
||||
/* Make sense to call GEMM */ \
|
||||
@@ -241,8 +243,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
|
||||
MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
|
||||
BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
|
||||
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
|
||||
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
|
||||
rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
|
||||
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
|
||||
rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \
|
||||
\
|
||||
/*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
|
||||
} \
|
||||
|
||||
@@ -15,48 +15,48 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// if the rhs is row major, let's transpose the product
|
||||
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
|
||||
struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
|
||||
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
|
||||
struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
|
||||
{
|
||||
static void run(
|
||||
Index size, Index cols,
|
||||
const Scalar* tri, Index triStride,
|
||||
Scalar* _other, Index otherStride,
|
||||
Scalar* _other, Index otherIncr, Index otherStride,
|
||||
level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
triangular_solve_matrix<
|
||||
Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
|
||||
(Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
|
||||
NumTraits<Scalar>::IsComplex && Conjugate,
|
||||
TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
|
||||
::run(size, cols, tri, triStride, _other, otherStride, blocking);
|
||||
TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride>
|
||||
::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
|
||||
}
|
||||
};
|
||||
|
||||
/* Optimized triangular solver with multiple right hand side and the triangular matrix on the left
|
||||
*/
|
||||
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
|
||||
struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
|
||||
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
|
||||
struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
|
||||
{
|
||||
static EIGEN_DONT_INLINE void run(
|
||||
Index size, Index otherSize,
|
||||
const Scalar* _tri, Index triStride,
|
||||
Scalar* _other, Index otherStride,
|
||||
Scalar* _other, Index otherIncr, Index otherStride,
|
||||
level3_blocking<Scalar,Scalar>& blocking);
|
||||
};
|
||||
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
|
||||
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
|
||||
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
|
||||
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
|
||||
Index size, Index otherSize,
|
||||
const Scalar* _tri, Index triStride,
|
||||
Scalar* _other, Index otherStride,
|
||||
Scalar* _other, Index otherIncr, Index otherStride,
|
||||
level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index cols = otherSize;
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
|
||||
typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
|
||||
typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
|
||||
TriMapper tri(_tri, triStride);
|
||||
OtherMapper other(_other, otherStride);
|
||||
OtherMapper other(_other, otherStride, otherIncr);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
@@ -128,19 +128,19 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
{
|
||||
Scalar b(0);
|
||||
const Scalar* l = &tri(i,s);
|
||||
Scalar* r = &other(s,j);
|
||||
typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
|
||||
for (Index i3=0; i3<k; ++i3)
|
||||
b += conj(l[i3]) * r[i3];
|
||||
b += conj(l[i3]) * r(i3);
|
||||
|
||||
other(i,j) = (other(i,j) - b)*a;
|
||||
}
|
||||
else
|
||||
{
|
||||
Scalar b = (other(i,j) *= a);
|
||||
Scalar* r = &other(s,j);
|
||||
const Scalar* l = &tri(s,i);
|
||||
typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
|
||||
typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
|
||||
for (Index i3=0;i3<rs;++i3)
|
||||
r[i3] -= b * conj(l[i3]);
|
||||
r(i3) -= b * conj(l(i3));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -185,28 +185,28 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
|
||||
/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
|
||||
*/
|
||||
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
|
||||
struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
|
||||
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
|
||||
struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
|
||||
{
|
||||
static EIGEN_DONT_INLINE void run(
|
||||
Index size, Index otherSize,
|
||||
const Scalar* _tri, Index triStride,
|
||||
Scalar* _other, Index otherStride,
|
||||
Scalar* _other, Index otherIncr, Index otherStride,
|
||||
level3_blocking<Scalar,Scalar>& blocking);
|
||||
};
|
||||
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
|
||||
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
|
||||
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
|
||||
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
|
||||
Index size, Index otherSize,
|
||||
const Scalar* _tri, Index triStride,
|
||||
Scalar* _other, Index otherStride,
|
||||
Scalar* _other, Index otherIncr, Index otherStride,
|
||||
level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index rows = otherSize;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
|
||||
typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
|
||||
typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
|
||||
LhsMapper lhs(_other, otherStride);
|
||||
LhsMapper lhs(_other, otherStride, otherIncr);
|
||||
RhsMapper rhs(_tri, triStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
@@ -297,24 +297,24 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
{
|
||||
Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
|
||||
|
||||
Scalar* r = &lhs(i2,j);
|
||||
typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j);
|
||||
for (Index k3=0; k3<k; ++k3)
|
||||
{
|
||||
Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
|
||||
Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
|
||||
typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3);
|
||||
for (Index i=0; i<actual_mc; ++i)
|
||||
r[i] -= a[i] * b;
|
||||
r(i) -= a(i) * b;
|
||||
}
|
||||
if((Mode & UnitDiag)==0)
|
||||
{
|
||||
Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
|
||||
for (Index i=0; i<actual_mc; ++i)
|
||||
r[i] *= inv_rjj;
|
||||
r(i) *= inv_rjj;
|
||||
}
|
||||
}
|
||||
|
||||
// pack the just computed part of lhs to A
|
||||
pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
|
||||
pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2),
|
||||
actualPanelWidth, actual_mc,
|
||||
actual_kc, j2);
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ namespace internal {
|
||||
// implements LeftSide op(triangular)^-1 * general
|
||||
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
|
||||
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
|
||||
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
|
||||
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
|
||||
{ \
|
||||
enum { \
|
||||
IsLower = (Mode&Lower) == Lower, \
|
||||
@@ -51,8 +51,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
|
||||
static void run( \
|
||||
Index size, Index otherSize, \
|
||||
const EIGTYPE* _tri, Index triStride, \
|
||||
EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
|
||||
EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
|
||||
{ \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
|
||||
eigen_assert(otherIncr == 1); \
|
||||
BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
|
||||
char side = 'L', uplo, diag='N', transa; \
|
||||
/* Set alpha_ */ \
|
||||
@@ -99,7 +101,7 @@ EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
|
||||
// implements RightSide general * op(triangular)^-1
|
||||
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
|
||||
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
|
||||
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
|
||||
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
|
||||
{ \
|
||||
enum { \
|
||||
IsLower = (Mode&Lower) == Lower, \
|
||||
@@ -110,8 +112,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
|
||||
static void run( \
|
||||
Index size, Index otherSize, \
|
||||
const EIGTYPE* _tri, Index triStride, \
|
||||
EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
|
||||
EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
|
||||
{ \
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
|
||||
eigen_assert(otherIncr == 1); \
|
||||
BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
|
||||
char side = 'R', uplo, diag='N', transa; \
|
||||
/* Set alpha_ */ \
|
||||
|
||||
@@ -31,7 +31,7 @@ template<
|
||||
typename Index,
|
||||
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
|
||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
|
||||
int ResStorageOrder>
|
||||
int ResStorageOrder, int ResInnerStride>
|
||||
struct general_matrix_matrix_product;
|
||||
|
||||
template<typename Index,
|
||||
@@ -155,13 +155,21 @@ class BlasVectorMapper {
|
||||
Scalar* m_data;
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Index, int AlignmentType, int Incr=1>
|
||||
class BlasLinearMapper;
|
||||
|
||||
template<typename Scalar, typename Index, int AlignmentType>
|
||||
class BlasLinearMapper {
|
||||
class BlasLinearMapper<Scalar,Index,AlignmentType,1> {
|
||||
public:
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename packet_traits<Scalar>::half HalfPacket;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1)
|
||||
: m_data(data)
|
||||
{
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(incr);
|
||||
eigen_assert(incr==1);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
|
||||
internal::prefetch(&operator()(i));
|
||||
@@ -188,16 +196,25 @@ class BlasLinearMapper {
|
||||
};
|
||||
|
||||
// Lightweight helper class to access matrix coefficients.
|
||||
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
|
||||
class blas_data_mapper {
|
||||
public:
|
||||
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
|
||||
class blas_data_mapper;
|
||||
|
||||
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
|
||||
class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
|
||||
{
|
||||
public:
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename packet_traits<Scalar>::half HalfPacket;
|
||||
|
||||
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
|
||||
typedef BlasVectorMapper<Scalar, Index> VectorMapper;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1)
|
||||
: m_data(data), m_stride(stride)
|
||||
{
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(incr);
|
||||
eigen_assert(incr==1);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
|
||||
getSubMapper(Index i, Index j) const {
|
||||
@@ -251,6 +268,90 @@ class blas_data_mapper {
|
||||
const Index m_stride;
|
||||
};
|
||||
|
||||
// Implementation of non-natural increment (i.e. inner-stride != 1)
|
||||
// The exposed API is not complete yet compared to the Incr==1 case
|
||||
// because some features make less sense in this case.
|
||||
// Primary BlasLinearMapper template: a one-dimensional view over `m_data` in which
// consecutive logical elements are `m_incr` scalars apart. The Incr==1 case is handled
// by a separate specialization, so this definition serves the remaining values of Incr.
template<typename Scalar, typename Index, int AlignmentType, int Incr>
|
||||
class BlasLinearMapper
|
||||
{
|
||||
public:
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename packet_traits<Scalar>::half HalfPacket;
|
||||
|
||||
// Stores the base pointer and the element increment; unlike the Incr==1
// specialization, `incr` has no default value and is not asserted to be 1.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {}
|
||||
|
||||
// Prefetches the address of logical element i (i.e. m_data + i*increment).
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
|
||||
internal::prefetch(&operator()(i));
|
||||
}
|
||||
|
||||
// Returns a reference to logical element i, located at m_data[i*increment].
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
|
||||
return m_data[i*m_incr.value()];
|
||||
}
|
||||
|
||||
// Builds a packet starting at logical element i by calling pgather with the
// increment as its second argument (presumably the stride between gathered
// elements -- confirm against pgather's definition).
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
|
||||
return pgather<Scalar,Packet>(m_data + i*m_incr.value(), m_incr.value());
|
||||
}
|
||||
|
||||
// Counterpart of loadPacket: writes packet p starting at logical element i by
// calling pscatter with the increment as its last argument.
template<typename PacketType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
|
||||
pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
|
||||
}
|
||||
|
||||
protected:
|
||||
// Base address of the mapped data.
Scalar *m_data;
|
||||
// Element increment, read through .value(); NOTE(review): variable_if_dynamic
// presumably stores it at run time only when Incr is Dynamic -- confirm.
const internal::variable_if_dynamic<Index,Incr> m_incr;
|
||||
};
|
||||
|
||||
// Primary blas_data_mapper template: a two-dimensional view over `m_data` addressed
// with an outer stride (`m_stride`) and an inner increment (`m_incr`). The Incr==1
// case is handled by a separate specialization, so this definition serves the
// remaining values of Incr.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType,int Incr>
|
||||
class blas_data_mapper
|
||||
{
|
||||
public:
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename packet_traits<Scalar>::half HalfPacket;
|
||||
|
||||
// Linear (1-D) mapper type that carries the same Incr template argument.
typedef BlasLinearMapper<Scalar, Index, AlignmentType,Incr> LinearMapper;
|
||||
|
||||
// Stores the base pointer, the outer stride and the inner increment; all three
// must be supplied (no default for `incr` here).
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {}
|
||||
|
||||
// Returns a mapper of the same type whose origin is coefficient (i,j) of this
// one, keeping the same stride and increment.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper
|
||||
getSubMapper(Index i, Index j) const {
|
||||
return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value());
|
||||
}
|
||||
|
||||
// Returns a LinearMapper starting at coefficient (i,j) and using the same increment.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
|
||||
return LinearMapper(&operator()(i, j), m_incr.value());
|
||||
}
|
||||
|
||||
// Coefficient access. For RowMajor the offset is j*incr + i*stride; otherwise it
// is i*incr + j*stride.
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
|
||||
return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride];
|
||||
}
|
||||
|
||||
// Builds a Packet starting at (i,j) via pgather, passing the inner increment.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
|
||||
return pgather<Scalar,Packet>(&operator()(i, j),m_incr.value());
|
||||
}
|
||||
|
||||
// Same as loadPacket but for a caller-chosen packet type. The AlignmentT
// template parameter is not used by this body.
template <typename PacketT, int AlignmentT>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
|
||||
return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
|
||||
}
|
||||
|
||||
// Writes packet p starting at (i,j) via pscatter. Note that the outer stride
// (m_stride), not the inner increment, is passed here.
template<typename SubPacket>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
|
||||
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
|
||||
}
|
||||
|
||||
// Reads a packet starting at (i,j) via pgather, again passing the outer stride
// (m_stride) rather than the inner increment.
template<typename SubPacket>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
|
||||
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
|
||||
}
|
||||
|
||||
protected:
|
||||
// Base address of the mapped data.
Scalar* EIGEN_RESTRICT m_data;
|
||||
// Outer stride used by operator() and by scatterPacket/gatherPacket.
const Index m_stride;
|
||||
// Inner increment, read through .value(); NOTE(review): variable_if_dynamic
// presumably stores it at run time only when Incr is Dynamic -- confirm.
const internal::variable_if_dynamic<Index,Incr> m_incr;
|
||||
};
|
||||
|
||||
// lightweight helper class to access matrix coefficients (const version)
|
||||
template<typename Scalar, typename Index, int StorageOrder>
|
||||
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
|
||||
|
||||
521
Eigen/src/Core/util/ConfigureVectorization.h
Normal file
521
Eigen/src/Core/util/ConfigureVectorization.h
Normal file
@@ -0,0 +1,521 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2020, Arm Limited and Contributors
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
|
||||
#define EIGEN_CONFIGURE_VECTORIZATION_H
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
// Static and dynamic alignment control
|
||||
//
|
||||
// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
// as the maximal boundary in bytes on which dynamically and statically allocated data may be aligned, respectively.
|
||||
// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
|
||||
// a default value is automatically computed based on architecture, compiler, and OS.
|
||||
//
|
||||
// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
|
||||
// to be used to declare statically aligned buffers.
|
||||
//------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
|
||||
* However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
|
||||
* so that vectorization doesn't affect binary compatibility.
|
||||
*
|
||||
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
|
||||
* vectorized and non-vectorized code.
|
||||
*
|
||||
* FIXME: this code can be cleaned up once we switch to proper C++11 only.
|
||||
*/
|
||||
#if (defined EIGEN_CUDACC)
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
|
||||
#define EIGEN_ALIGNOF(x) __alignof(x)
|
||||
#elif EIGEN_HAS_ALIGNAS
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
|
||||
#define EIGEN_ALIGNOF(x) alignof(x)
|
||||
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
|
||||
#define EIGEN_ALIGNOF(x) __alignof(x)
|
||||
#elif EIGEN_COMP_MSVC
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
|
||||
#define EIGEN_ALIGNOF(x) __alignof(x)
|
||||
#elif EIGEN_COMP_SUNCC
|
||||
// FIXME not sure about this one:
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
|
||||
#define EIGEN_ALIGNOF(x) __alignof(x)
|
||||
#else
|
||||
#error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler
|
||||
#endif
|
||||
|
||||
// If the user explicitly disables vectorization, then we also disable alignment
|
||||
#if defined(EIGEN_DONT_VECTORIZE)
|
||||
#if defined(EIGEN_GPUCC)
|
||||
// GPU code is always vectorized and requires memory alignment for
|
||||
// statically allocated buffers.
|
||||
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
|
||||
#else
|
||||
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
|
||||
#endif
|
||||
#elif defined(__AVX512F__)
|
||||
// 64 bytes static alignment is preferred only if really required
|
||||
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
|
||||
#elif defined(__AVX__)
|
||||
// 32 bytes static alignment is preferred only if really required
|
||||
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
|
||||
#else
|
||||
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
|
||||
#endif
|
||||
|
||||
|
||||
// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
|
||||
#define EIGEN_MIN_ALIGN_BYTES 16
|
||||
|
||||
// Defines the boundary (in bytes) on which the data needs to be aligned. Note
|
||||
// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
|
||||
// aligned at all regardless of the value of this #define.
|
||||
|
||||
#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
|
||||
#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
|
||||
#endif
|
||||
|
||||
// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
|
||||
// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
|
||||
#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
|
||||
#ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
#undef EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
#endif
|
||||
#define EIGEN_MAX_STATIC_ALIGN_BYTES 0
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
|
||||
// Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
|
||||
// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
|
||||
// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
|
||||
// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
|
||||
// certain common platforms (compiler+architecture combinations) to avoid these problems.
|
||||
// Only static alignment is really problematic (relies on nonstandard compiler extensions),
|
||||
// try to keep heap alignment even when we have to disable static alignment.
|
||||
#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
|
||||
#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
|
||||
#elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
|
||||
// Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
|
||||
// Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
|
||||
// 4.8 and newer seem definitely unaffected.
|
||||
#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
|
||||
#else
|
||||
#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
|
||||
#endif
|
||||
|
||||
// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
|
||||
#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
|
||||
&& !EIGEN_GCC3_OR_OLDER \
|
||||
&& !EIGEN_COMP_SUNCC \
|
||||
&& !EIGEN_OS_QNX
|
||||
#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
|
||||
#else
|
||||
#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
|
||||
#endif
|
||||
|
||||
#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
|
||||
#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
|
||||
#else
|
||||
#define EIGEN_MAX_STATIC_ALIGN_BYTES 0
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
#undef EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
|
||||
#endif
|
||||
|
||||
#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
|
||||
#define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
|
||||
#endif
|
||||
|
||||
// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
|
||||
// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
|
||||
// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
|
||||
// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
|
||||
|
||||
|
||||
// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
|
||||
#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
|
||||
#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
|
||||
#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
|
||||
#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
|
||||
#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
|
||||
#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
|
||||
#else
|
||||
#define EIGEN_ALIGN_MAX
|
||||
#endif
|
||||
|
||||
|
||||
// Dynamic alignment control
|
||||
|
||||
#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
|
||||
#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_DONT_ALIGN
|
||||
#ifdef EIGEN_MAX_ALIGN_BYTES
|
||||
#undef EIGEN_MAX_ALIGN_BYTES
|
||||
#endif
|
||||
#define EIGEN_MAX_ALIGN_BYTES 0
|
||||
#elif !defined(EIGEN_MAX_ALIGN_BYTES)
|
||||
#define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
|
||||
#endif
|
||||
|
||||
#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
|
||||
#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
|
||||
#else
|
||||
#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef EIGEN_UNALIGNED_VECTORIZE
|
||||
#define EIGEN_UNALIGNED_VECTORIZE 1
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
|
||||
// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
|
||||
#if EIGEN_MAX_ALIGN_BYTES==0
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
|
||||
// removed as gcc 4.1 and msvc 2008 are not supported anyway.
|
||||
#if EIGEN_COMP_MSVC
|
||||
#include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
|
||||
#if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
|
||||
// a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
|
||||
#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
|
||||
#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
|
||||
#define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
|
||||
|
||||
#if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
|
||||
|
||||
// Defines symbols for compile-time detection of which instructions are
|
||||
// used.
|
||||
// EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_SSE
|
||||
#define EIGEN_VECTORIZE_SSE2
|
||||
|
||||
// Detect sse3/ssse3/sse4:
|
||||
// gcc and icc define __SSE3__, ...
|
||||
// there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
|
||||
// want to force the use of those instructions with msvc.
|
||||
#ifdef __SSE3__
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#endif
|
||||
#ifdef __SSSE3__
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#endif
|
||||
#ifdef __SSE4_1__
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#endif
|
||||
#ifdef __SSE4_2__
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#ifdef __AVX__
|
||||
#ifndef EIGEN_USE_SYCL
|
||||
#define EIGEN_VECTORIZE_AVX
|
||||
#endif
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#ifdef __AVX2__
|
||||
#ifndef EIGEN_USE_SYCL
|
||||
#define EIGEN_VECTORIZE_AVX2
|
||||
#define EIGEN_VECTORIZE_AVX
|
||||
#endif
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
|
||||
// MSVC does not expose a switch dedicated for FMA
|
||||
// For MSVC, AVX2 => FMA
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#endif
|
||||
#if defined(__AVX512F__)
|
||||
#ifndef EIGEN_VECTORIZE_FMA
|
||||
#if EIGEN_COMP_GNUC
|
||||
#error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
|
||||
#else
|
||||
#error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
|
||||
#endif
|
||||
#endif
|
||||
#ifndef EIGEN_USE_SYCL
|
||||
#define EIGEN_VECTORIZE_AVX512
|
||||
#define EIGEN_VECTORIZE_AVX2
|
||||
#define EIGEN_VECTORIZE_AVX
|
||||
#endif
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#ifndef EIGEN_USE_SYCL
|
||||
#ifdef __AVX512DQ__
|
||||
#define EIGEN_VECTORIZE_AVX512DQ
|
||||
#endif
|
||||
#ifdef __AVX512ER__
|
||||
#define EIGEN_VECTORIZE_AVX512ER
|
||||
#endif
|
||||
#ifdef __AVX512BF16__
|
||||
#define EIGEN_VECTORIZE_AVX512BF16
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Disable AVX support on broken xcode versions
|
||||
#if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
|
||||
// A nasty bug in the clang compiler shipped with xcode in a common compilation situation
|
||||
// when XCode 11.0 is used with Mac deployment target macOS 10.15; see https://trac.macports.org/ticket/58776#no1
|
||||
#ifdef EIGEN_VECTORIZE_AVX
|
||||
#undef EIGEN_VECTORIZE_AVX
|
||||
#warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
|
||||
#ifdef EIGEN_VECTORIZE_AVX2
|
||||
#undef EIGEN_VECTORIZE_AVX2
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#undef EIGEN_VECTORIZE_FMA
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_AVX512
|
||||
#undef EIGEN_VECTORIZE_AVX512
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
#undef EIGEN_VECTORIZE_AVX512DQ
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_AVX512ER
|
||||
#undef EIGEN_VECTORIZE_AVX512ER
|
||||
#endif
|
||||
#endif
|
||||
// NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with -macosx-version-min=10.15 and AVX
|
||||
// NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests
|
||||
// NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases
|
||||
// NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)" XCode 11.0 <- Produces many segfault and core dumping tests
|
||||
// with -macosx-version-min=10.15 and AVX
|
||||
// NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with
|
||||
// -macosx-version-min=10.15 and AVX
|
||||
#endif
|
||||
|
||||
// include files
|
||||
|
||||
// This extern "C" works around a MINGW-w64 compilation issue
|
||||
// https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
|
||||
// In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
|
||||
// However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
|
||||
// with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
|
||||
// so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
|
||||
// notice that since these are C headers, the extern "C" is theoretically needed anyways.
|
||||
extern "C" {
|
||||
// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
|
||||
// Doing so triggers some issues with ICC. However old gcc versions seem to not have this file, thus:
|
||||
#if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
|
||||
#include <immintrin.h>
|
||||
#else
|
||||
#include <mmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
#ifdef EIGEN_VECTORIZE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSSE3
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_2
|
||||
#include <nmmintrin.h>
|
||||
#endif
|
||||
#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
} // end extern "C"
|
||||
|
||||
#elif defined(__VSX__) && !defined(__APPLE__)
|
||||
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_VSX 1
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#include <altivec.h>
|
||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||
// => use __vector instead of vector
|
||||
#undef bool
|
||||
#undef vector
|
||||
#undef pixel
|
||||
|
||||
#elif defined __ALTIVEC__
|
||||
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_ALTIVEC
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#include <altivec.h>
|
||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||
// => use __vector instead of vector
|
||||
#undef bool
|
||||
#undef vector
|
||||
#undef pixel
|
||||
|
||||
#elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)
|
||||
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_NEON
|
||||
#include <arm_neon.h>
|
||||
|
||||
// We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
|
||||
// will not select the backend automatically
|
||||
#elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)
|
||||
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_SVE
|
||||
#include <arm_sve.h>
|
||||
|
||||
// Since we depend on knowing SVE vector lengths at compile-time, we need
|
||||
// to ensure a fixed length is set
|
||||
#if defined __ARM_FEATURE_SVE_BITS
|
||||
#define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
|
||||
#else
|
||||
#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
|
||||
#endif
|
||||
|
||||
#elif (defined __s390x__ && defined __VEC__)
|
||||
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_ZVECTOR
|
||||
#include <vecintrin.h>
|
||||
|
||||
#elif defined __mips_msa
|
||||
|
||||
// Limit MSA optimizations to little-endian CPUs for now.
|
||||
// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
|
||||
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
|
||||
#if defined(__LP64__)
|
||||
#define EIGEN_MIPS_64
|
||||
#else
|
||||
#define EIGEN_MIPS_32
|
||||
#endif
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_MSA
|
||||
#include <msa.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
|
||||
// compilers seem to follow this. We therefore include it explicitly.
|
||||
// See also: https://bugs.llvm.org/show_bug.cgi?id=47955
|
||||
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
|
||||
#include <arm_fp16.h>
|
||||
#endif
|
||||
|
||||
// Enable FMA for ARM.
|
||||
#if defined(__ARM_FEATURE_FMA)
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#endif
|
||||
|
||||
#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_COMP_CLANG>=380)
|
||||
// We can use the optimized fp16 to float and float to fp16 conversion routines
|
||||
#define EIGEN_HAS_FP16_C
|
||||
|
||||
#if EIGEN_COMP_GNUC
|
||||
// Make sure immintrin.h is included, even if e.g. vectorization is
|
||||
// explicitly disabled (see also issue #2395).
|
||||
// Note that FP16C intrinsics for gcc and clang are included by immintrin.h,
|
||||
// as opposed to emmintrin.h as suggested by Intel:
|
||||
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_CUDACC
|
||||
#define EIGEN_VECTORIZE_GPU
|
||||
#include <vector_types.h>
|
||||
#if EIGEN_CUDA_SDK_VER >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16)
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HIPCC)
|
||||
#define EIGEN_VECTORIZE_GPU
|
||||
#include <hip/hip_vector_types.h>
|
||||
#define EIGEN_HAS_HIP_FP16
|
||||
#include <hip/hip_fp16.h>
|
||||
#endif
|
||||
|
||||
|
||||
/** \brief Namespace containing all symbols from the %Eigen library. */
|
||||
namespace Eigen {
|
||||
|
||||
/** \returns a human-readable, comma-separated list of the SIMD instruction sets
  * that were selected at compile time through the EIGEN_VECTORIZE_* macros,
  * or "None" if none of the macros tested below is defined.
  *
  * Only the first matching branch is reported: the x86 branches are tested from
  * the widest set (AVX512) down to SSE2, followed by the non-x86 back-ends.
  * Note that there is no dedicated AVX2 or FMA branch, so a build with
  * EIGEN_VECTORIZE_AVX2 but without EIGEN_VECTORIZE_AVX512 is reported through
  * the plain AVX string.
  */
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_AVX512)
  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_AVX)
  // A comma follows "AVX" so the list is separated the same way as in every other branch.
  return "AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_2)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
#elif defined(EIGEN_VECTORIZE_SSSE3)
  return "SSE, SSE2, SSE3, SSSE3";
#elif defined(EIGEN_VECTORIZE_SSE3)
  return "SSE, SSE2, SSE3";
#elif defined(EIGEN_VECTORIZE_SSE2)
  return "SSE, SSE2";
#elif defined(EIGEN_VECTORIZE_ALTIVEC)
  return "AltiVec";
#elif defined(EIGEN_VECTORIZE_VSX)
  return "VSX";
#elif defined(EIGEN_VECTORIZE_NEON)
  return "ARM NEON";
#elif defined(EIGEN_VECTORIZE_SVE)
  return "ARM SVE";
#elif defined(EIGEN_VECTORIZE_ZVECTOR)
  return "S390X ZVECTOR";
#elif defined(EIGEN_VECTORIZE_MSA)
  return "MIPS MSA";
#else
  return "None";
#endif
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_CONFIGURE_VECTORIZATION_H
|
||||
@@ -1,83 +1,146 @@
|
||||
#ifndef EIGEN_WARNINGS_DISABLED
|
||||
#define EIGEN_WARNINGS_DISABLED
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
|
||||
// 4101 - unreferenced local variable
|
||||
// 4127 - conditional expression is constant
|
||||
// 4181 - qualifier applied to reference type ignored
|
||||
// 4211 - nonstandard extension used : redefined extern to static
|
||||
// 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
|
||||
// 4273 - QtAlignedMalloc, inconsistent DLL linkage
|
||||
// 4324 - structure was padded due to declspec(align())
|
||||
// 4503 - decorated name length exceeded, name was truncated
|
||||
// 4512 - assignment operator could not be generated
|
||||
// 4522 - 'class' : multiple assignment operators specified
|
||||
// 4700 - uninitialized local variable 'xyz' used
|
||||
// 4714 - function marked as __forceinline not inlined
|
||||
// 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
|
||||
// 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
|
||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||
#pragma warning( push )
|
||||
#endif
|
||||
#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
|
||||
#if defined(_MSC_VER)
|
||||
// 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
|
||||
// 4101 - unreferenced local variable
|
||||
// 4127 - conditional expression is constant
|
||||
// 4181 - qualifier applied to reference type ignored
|
||||
// 4211 - nonstandard extension used : redefined extern to static
|
||||
// 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
|
||||
// 4273 - QtAlignedMalloc, inconsistent DLL linkage
|
||||
// 4324 - structure was padded due to declspec(align())
|
||||
// 4503 - decorated name length exceeded, name was truncated
|
||||
// 4512 - assignment operator could not be generated
|
||||
// 4522 - 'class' : multiple assignment operators specified
|
||||
// 4700 - uninitialized local variable 'xyz' used
|
||||
// 4714 - function marked as __forceinline not inlined
|
||||
// 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
|
||||
// 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
|
||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||
#pragma warning(push)
|
||||
#endif
|
||||
#pragma warning(disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
|
||||
// We currently rely on has_denorm in tests, and need it defined correctly for half/bfloat16.
|
||||
#ifndef _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
|
||||
#define EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING 1
|
||||
#define _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#elif defined __INTEL_COMPILER
|
||||
// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
|
||||
// ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
|
||||
// typedef that may be a reference type.
|
||||
// 279 - controlling expression is constant
|
||||
// ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case.
|
||||
// 1684 - conversion from pointer to same-sized integral type (potential portability problem)
|
||||
// 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits
|
||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||
#pragma warning push
|
||||
#endif
|
||||
#pragma warning disable 2196 279 1684 2259
|
||||
// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
|
||||
// ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e.
|
||||
// inside of class body typedef that may be a reference type.
|
||||
// 279 - controlling expression is constant
|
||||
// ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is
|
||||
// a legitimate use case.
|
||||
// 1684 - conversion from pointer to same-sized integral type (potential portability problem)
|
||||
// 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits
|
||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||
#pragma warning push
|
||||
#endif
|
||||
#pragma warning disable 2196 279 1684 2259
|
||||
|
||||
#elif defined __clang__
|
||||
// -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
|
||||
// this is really a stupid warning as it warns on compile-time expressions involving enums
|
||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||
#pragma clang diagnostic push
|
||||
#endif
|
||||
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
|
||||
|
||||
#elif defined __GNUC__
|
||||
|
||||
#if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
|
||||
#pragma GCC diagnostic push
|
||||
#endif
|
||||
// g++ warns about local variables shadowing member functions, which is too strict
|
||||
#pragma GCC diagnostic ignored "-Wshadow"
|
||||
#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
|
||||
// Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
|
||||
#pragma GCC diagnostic ignored "-Wtype-limits"
|
||||
#endif
|
||||
#if __GNUC__>=6
|
||||
#pragma GCC diagnostic ignored "-Wignored-attributes"
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||
#pragma clang diagnostic push
|
||||
#endif
|
||||
#if defined(__has_warning)
|
||||
// -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
|
||||
// this is really a stupid warning as it warns on compile-time expressions involving enums
|
||||
#if __has_warning("-Wconstant-logical-operand")
|
||||
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
|
||||
#endif
|
||||
#if __has_warning("-Wimplicit-int-float-conversion")
|
||||
#pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
|
||||
#endif
|
||||
#if (defined(__ALTIVEC__) || defined(__VSX__)) && (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L))
|
||||
// warning: generic selections are a C11-specific feature
|
||||
// ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
|
||||
#if __has_warning("-Wc11-extensions")
|
||||
#pragma clang diagnostic ignored "-Wc11-extensions"
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined __NVCC__
|
||||
// Disable the "statement is unreachable" message
|
||||
#pragma diag_suppress code_is_unreachable
|
||||
// Disable the "dynamic initialization in unreachable code" message
|
||||
#pragma diag_suppress initialization_not_reachable
|
||||
// Disable the "invalid error number" message that we get with older versions of nvcc
|
||||
#pragma diag_suppress 1222
|
||||
// Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
|
||||
#pragma diag_suppress 2527
|
||||
#pragma diag_suppress 2529
|
||||
#pragma diag_suppress 2651
|
||||
#pragma diag_suppress 2653
|
||||
#pragma diag_suppress 2668
|
||||
#pragma diag_suppress 2669
|
||||
#pragma diag_suppress 2670
|
||||
#pragma diag_suppress 2671
|
||||
#pragma diag_suppress 2735
|
||||
#pragma diag_suppress 2737
|
||||
#elif defined __GNUC__ && !defined(__FUJITSU)
|
||||
|
||||
#if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
|
||||
#pragma GCC diagnostic push
|
||||
#endif
|
||||
// g++ warns about local variables shadowing member functions, which is too strict
|
||||
#pragma GCC diagnostic ignored "-Wshadow"
|
||||
#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
|
||||
// Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
|
||||
#pragma GCC diagnostic ignored "-Wtype-limits"
|
||||
#endif
|
||||
#if __GNUC__ >= 6
|
||||
#pragma GCC diagnostic ignored "-Wignored-attributes"
|
||||
#endif
|
||||
#if __GNUC__ == 7
|
||||
// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
|
||||
#pragma GCC diagnostic ignored "-Wattributes"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // not EIGEN_WARNINGS_DISABLED
|
||||
#if defined __NVCC__ && defined __CUDACC__
|
||||
// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
|
||||
// we instead use Microsoft's __pragma extension.
|
||||
#if defined _MSC_VER
|
||||
#define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
|
||||
#else
|
||||
#define EIGEN_MAKE_PRAGMA(X) _Pragma(#X)
|
||||
#endif
|
||||
#if defined __NVCC_DIAG_PRAGMA_SUPPORT__
|
||||
#define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X)
|
||||
#else
|
||||
#define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X)
|
||||
#endif
|
||||
|
||||
EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant)
|
||||
// Disable the "statement is unreachable" message
|
||||
EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable)
|
||||
// Disable the "dynamic initialization in unreachable code" message
|
||||
EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable)
|
||||
// Disable the "invalid error number" message that we get with older versions of nvcc
|
||||
EIGEN_NV_DIAG_SUPPRESS(1222)
|
||||
// Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are
|
||||
// many of them and they seem to change with every version of the compiler)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2527)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2529)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2651)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2653)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2668)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2669)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2670)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2671)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2735)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2737)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2739)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2885)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2888)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2976)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2979)
|
||||
EIGEN_NV_DIAG_SUPPRESS(20011)
|
||||
EIGEN_NV_DIAG_SUPPRESS(20014)
|
||||
// Disable the "__device__ annotation is ignored on a function(...) that is
|
||||
// explicitly defaulted on its first declaration" message.
|
||||
// The __device__ annotation seems to actually be needed in some cases,
|
||||
// otherwise resulting in kernel runtime errors.
|
||||
EIGEN_NV_DIAG_SUPPRESS(2886)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2929)
|
||||
EIGEN_NV_DIAG_SUPPRESS(2977)
|
||||
EIGEN_NV_DIAG_SUPPRESS(20012)
|
||||
#undef EIGEN_NV_DIAG_SUPPRESS
|
||||
#undef EIGEN_MAKE_PRAGMA
|
||||
#endif
|
||||
|
||||
#else
|
||||
// warnings already disabled:
|
||||
#ifndef EIGEN_WARNINGS_DISABLED_2
|
||||
#define EIGEN_WARNINGS_DISABLED_2
|
||||
#elif defined(EIGEN_INTERNAL_DEBUGGING)
|
||||
#error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
|
||||
#endif
|
||||
|
||||
#endif // not EIGEN_WARNINGS_DISABLED
|
||||
|
||||
@@ -47,11 +47,7 @@ template<typename T> struct NumTraits;
|
||||
template<typename Derived> struct EigenBase;
|
||||
template<typename Derived> class DenseBase;
|
||||
template<typename Derived> class PlainObjectBase;
|
||||
|
||||
|
||||
template<typename Derived,
|
||||
int Level = internal::accessors_level<Derived>::value >
|
||||
class DenseCoeffsBase;
|
||||
template<typename Derived, int Level> class DenseCoeffsBase;
|
||||
|
||||
template<typename _Scalar, int _Rows, int _Cols,
|
||||
int _Options = AutoAlign |
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
|
||||
#define EIGEN_WORLD_VERSION 3
|
||||
#define EIGEN_MAJOR_VERSION 3
|
||||
#define EIGEN_MINOR_VERSION 7
|
||||
#define EIGEN_MINOR_VERSION 9
|
||||
|
||||
#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
|
||||
(EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
|
||||
@@ -380,7 +380,8 @@
|
||||
#if EIGEN_MAX_CPP_VER>=11 && \
|
||||
((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \
|
||||
|| (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
|
||||
|| (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)))
|
||||
|| (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \
|
||||
|| (EIGEN_COMP_MSVC >= 1900) )
|
||||
#define EIGEN_HAS_C99_MATH 1
|
||||
#else
|
||||
#define EIGEN_HAS_C99_MATH 0
|
||||
@@ -396,6 +397,20 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Does the compiler support type_traits?
|
||||
// - full support of type traits was added only to GCC 5.1.0.
|
||||
// - 20150626 corresponds to the last release of 4.x libstdc++
|
||||
#ifndef EIGEN_HAS_TYPE_TRAITS
|
||||
#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \
|
||||
&& ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \
|
||||
&& ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626)
|
||||
#define EIGEN_HAS_TYPE_TRAITS 1
|
||||
#define EIGEN_INCLUDE_TYPE_TRAITS
|
||||
#else
|
||||
#define EIGEN_HAS_TYPE_TRAITS 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Does the compiler support variadic templates?
|
||||
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
|
||||
@@ -835,11 +850,48 @@ namespace Eigen {
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* \internal
|
||||
* \brief Macro to explicitly define the default copy constructor.
|
||||
* This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
|
||||
*/
|
||||
#if EIGEN_HAS_CXX11
|
||||
#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
|
||||
#else
|
||||
#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/** \internal
|
||||
* \brief Macro to manually inherit assignment operators.
|
||||
* This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
|
||||
* With C++11 or later this also default-implements the copy-constructor
|
||||
*/
|
||||
#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
|
||||
#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
|
||||
EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
|
||||
|
||||
/** \internal
|
||||
* \brief Macro to manually define default constructors and destructors.
|
||||
* This is necessary when the copy constructor is re-defined.
|
||||
* For empty helper classes this should usually be protected, to avoid accidentally creating empty objects.
|
||||
*
|
||||
* Hiding the default destructor led to problems in C++03 mode together with boost::multiprecision
|
||||
*/
|
||||
#if EIGEN_HAS_CXX11
|
||||
#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
|
||||
EIGEN_DEVICE_FUNC Derived() = default; \
|
||||
EIGEN_DEVICE_FUNC ~Derived() = default;
|
||||
#else
|
||||
#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
|
||||
EIGEN_DEVICE_FUNC Derived() {}; \
|
||||
/* EIGEN_DEVICE_FUNC ~Derived() {}; */
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Just a side note. Commenting within defines works only by documenting
|
||||
|
||||
@@ -16,8 +16,40 @@
|
||||
#include <math_constants.h>
|
||||
#endif
|
||||
|
||||
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
|
||||
// Recent versions of ICC require <cstdint> for pointer types below.
|
||||
#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11)
|
||||
|
||||
// Define portable (u)int{32,64} types
|
||||
#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT
|
||||
#include <cstdint>
|
||||
namespace Eigen {
|
||||
namespace numext {
|
||||
typedef std::uint8_t uint8_t;
|
||||
typedef std::int8_t int8_t;
|
||||
typedef std::uint16_t uint16_t;
|
||||
typedef std::int16_t int16_t;
|
||||
typedef std::uint32_t uint32_t;
|
||||
typedef std::int32_t int32_t;
|
||||
typedef std::uint64_t uint64_t;
|
||||
typedef std::int64_t int64_t;
|
||||
}
|
||||
}
|
||||
#else
|
||||
// Without c++11, all compilers able to compile Eigen also
|
||||
// provide the C99 stdint.h header file.
|
||||
#include <stdint.h>
|
||||
namespace Eigen {
|
||||
namespace numext {
|
||||
typedef ::uint8_t uint8_t;
|
||||
typedef ::int8_t int8_t;
|
||||
typedef ::uint16_t uint16_t;
|
||||
typedef ::int16_t int16_t;
|
||||
typedef ::uint32_t uint32_t;
|
||||
typedef ::int32_t int32_t;
|
||||
typedef ::uint64_t uint64_t;
|
||||
typedef ::int64_t int64_t;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace Eigen {
|
||||
@@ -43,13 +75,14 @@ namespace internal {
|
||||
|
||||
// Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
|
||||
// and older versions do not provide *intptr_t types.
|
||||
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
|
||||
#if EIGEN_ICC_NEEDS_CSTDINT
|
||||
typedef std::intptr_t IntPtr;
|
||||
typedef std::uintptr_t UIntPtr;
|
||||
#else
|
||||
typedef std::ptrdiff_t IntPtr;
|
||||
typedef std::size_t UIntPtr;
|
||||
#endif
|
||||
#undef EIGEN_ICC_NEEDS_CSTDINT
|
||||
|
||||
struct true_type { enum { value = 1 }; };
|
||||
struct false_type { enum { value = 0 }; };
|
||||
@@ -97,6 +130,9 @@ template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<signed long> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
using std::is_integral;
|
||||
#else
|
||||
template<typename T> struct is_integral { enum { value = false }; };
|
||||
template<> struct is_integral<bool> { enum { value = true }; };
|
||||
template<> struct is_integral<char> { enum { value = true }; };
|
||||
@@ -108,6 +144,11 @@ template<> struct is_integral<signed int> { enum { value = true }; };
|
||||
template<> struct is_integral<unsigned int> { enum { value = true }; };
|
||||
template<> struct is_integral<signed long> { enum { value = true }; };
|
||||
template<> struct is_integral<unsigned long> { enum { value = true }; };
|
||||
#if EIGEN_COMP_MSVC
|
||||
template<> struct is_integral<signed __int64> { enum { value = true }; };
|
||||
template<> struct is_integral<unsigned __int64>{ enum { value = true }; };
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
using std::make_unsigned;
|
||||
@@ -125,10 +166,8 @@ template<> struct make_unsigned<signed int> { typedef unsigned int type; }
|
||||
template<> struct make_unsigned<unsigned int> { typedef unsigned int type; };
|
||||
template<> struct make_unsigned<signed long> { typedef unsigned long type; };
|
||||
template<> struct make_unsigned<unsigned long> { typedef unsigned long type; };
|
||||
#if EIGEN_COMP_MSVC
|
||||
template<> struct make_unsigned<signed __int64> { typedef unsigned __int64 type; };
|
||||
template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
|
||||
#endif
|
||||
template<> struct make_unsigned<signed long long> { typedef unsigned long type; };
|
||||
template<> struct make_unsigned<unsigned long long> { typedef unsigned long type; };
|
||||
#endif
|
||||
|
||||
template <typename T> struct add_const { typedef const T type; };
|
||||
@@ -486,7 +525,7 @@ template<typename T, typename U> struct scalar_product_traits
|
||||
} // end namespace internal
|
||||
|
||||
namespace numext {
|
||||
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
|
||||
#else
|
||||
@@ -502,7 +541,7 @@ using std::numeric_limits;
|
||||
// Integer division with rounding up.
|
||||
// T is assumed to be an integer type with a>=0, and b>0
|
||||
template<typename T>
|
||||
T div_ceil(const T &a, const T &b)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T div_ceil(const T &a, const T &b)
|
||||
{
|
||||
return (a+b-1) / b;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
#ifdef EIGEN_WARNINGS_DISABLED
|
||||
#ifdef EIGEN_WARNINGS_DISABLED_2
|
||||
// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet!
|
||||
# undef EIGEN_WARNINGS_DISABLED_2
|
||||
|
||||
#elif defined(EIGEN_WARNINGS_DISABLED)
|
||||
#undef EIGEN_WARNINGS_DISABLED
|
||||
|
||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||
|
||||
@@ -34,6 +34,20 @@ inline IndexDest convert_index(const IndexSrc& idx) {
|
||||
return IndexDest(idx);
|
||||
}
|
||||
|
||||
// true if T can be considered as an integral index (i.e., an integral type or enum)
|
||||
template<typename T> struct is_valid_index_type
|
||||
{
|
||||
enum { value =
|
||||
#if EIGEN_HAS_TYPE_TRAITS
|
||||
internal::is_integral<T>::value || std::is_enum<T>::value
|
||||
#elif EIGEN_COMP_MSVC
|
||||
internal::is_integral<T>::value || __is_enum(T)
|
||||
#else
|
||||
// without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
|
||||
internal::is_convertible<T,Index>::value && !internal::is_same<T,float>::value && !is_same<T,double>::value
|
||||
#endif
|
||||
};
|
||||
};
|
||||
|
||||
// promote_scalar_arg is a helper used in operations between an expression and a scalar, like:
|
||||
// expression * scalar
|
||||
@@ -90,6 +104,9 @@ class no_assignment_operator
|
||||
{
|
||||
private:
|
||||
no_assignment_operator& operator=(const no_assignment_operator&);
|
||||
protected:
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator)
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(no_assignment_operator)
|
||||
};
|
||||
|
||||
/** \internal return the index type with the largest number of bits */
|
||||
|
||||
@@ -300,10 +300,13 @@ typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::compu
|
||||
ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1);
|
||||
ComplexScalar eival1 = (trace + disc) / RealScalar(2);
|
||||
ComplexScalar eival2 = (trace - disc) / RealScalar(2);
|
||||
|
||||
if(numext::norm1(eival1) > numext::norm1(eival2))
|
||||
RealScalar eival1_norm = numext::norm1(eival1);
|
||||
RealScalar eival2_norm = numext::norm1(eival2);
|
||||
// A division by zero can only occur if eival1==eival2==0.
|
||||
// In this case, det==0, and all we have to do is checking that eival2_norm!=0
|
||||
if(eival1_norm > eival2_norm)
|
||||
eival2 = det / eival1;
|
||||
else
|
||||
else if(eival2_norm!=RealScalar(0))
|
||||
eival1 = det / eival2;
|
||||
|
||||
// choose the eigenvalue closest to the bottom entry of the diagonal
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_MATRIXBASEEIGENVALUES_H
|
||||
#define EIGEN_MATRIXBASEEIGENVALUES_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -42,13 +42,13 @@ struct eigenvalues_selector<Derived, false>
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
/** \brief Computes the eigenvalues of a matrix
|
||||
/** \brief Computes the eigenvalues of a matrix
|
||||
* \returns Column vector containing the eigenvalues.
|
||||
*
|
||||
* \eigenvalues_module
|
||||
* This function computes the eigenvalues with the help of the EigenSolver
|
||||
* class (for real matrices) or the ComplexEigenSolver class (for complex
|
||||
* matrices).
|
||||
* matrices).
|
||||
*
|
||||
* The eigenvalues are repeated according to their algebraic multiplicity,
|
||||
* so there are as many eigenvalues as rows in the matrix.
|
||||
@@ -83,8 +83,8 @@ MatrixBase<Derived>::eigenvalues() const
|
||||
*
|
||||
* \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
|
||||
*/
|
||||
template<typename MatrixType, unsigned int UpLo>
|
||||
inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
|
||||
template<typename MatrixType, unsigned int UpLo>
|
||||
EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
|
||||
SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
|
||||
{
|
||||
PlainObject thisAsMatrix(*this);
|
||||
@@ -147,7 +147,7 @@ MatrixBase<Derived>::operatorNorm() const
|
||||
* \sa eigenvalues(), MatrixBase::operatorNorm()
|
||||
*/
|
||||
template<typename MatrixType, unsigned int UpLo>
|
||||
inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
|
||||
EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
|
||||
SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
|
||||
{
|
||||
return eigenvalues().cwiseAbs().maxCoeff();
|
||||
|
||||
@@ -236,7 +236,7 @@ template<typename _MatrixType> class RealSchur
|
||||
typedef Matrix<Scalar,3,1> Vector3s;
|
||||
|
||||
Scalar computeNormOfT();
|
||||
Index findSmallSubdiagEntry(Index iu);
|
||||
Index findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero);
|
||||
void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
|
||||
void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
|
||||
void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
|
||||
@@ -302,12 +302,16 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
|
||||
Index totalIter = 0; // iteration count for whole matrix
|
||||
Scalar exshift(0); // sum of exceptional shifts
|
||||
Scalar norm = computeNormOfT();
|
||||
// sub-diagonal entries smaller than considerAsZero will be treated as zero.
|
||||
// We use eps^2 to enable more precision in small eigenvalues.
|
||||
Scalar considerAsZero = numext::maxi<Scalar>( norm * numext::abs2(NumTraits<Scalar>::epsilon()),
|
||||
(std::numeric_limits<Scalar>::min)() );
|
||||
|
||||
if(norm!=Scalar(0))
|
||||
{
|
||||
while (iu >= 0)
|
||||
{
|
||||
Index il = findSmallSubdiagEntry(iu);
|
||||
Index il = findSmallSubdiagEntry(iu,considerAsZero);
|
||||
|
||||
// Check for convergence
|
||||
if (il == iu) // One root found
|
||||
@@ -364,14 +368,17 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()
|
||||
|
||||
/** \internal Look for single small sub-diagonal element and returns its index */
|
||||
template<typename MatrixType>
|
||||
inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
|
||||
inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero)
|
||||
{
|
||||
using std::abs;
|
||||
Index res = iu;
|
||||
while (res > 0)
|
||||
{
|
||||
Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
|
||||
if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s)
|
||||
|
||||
s = numext::maxi<Scalar>(s * NumTraits<Scalar>::epsilon(), considerAsZero);
|
||||
|
||||
if (abs(m_matT.coeff(res,res-1)) <= s)
|
||||
break;
|
||||
res--;
|
||||
}
|
||||
|
||||
@@ -605,7 +605,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
|
||||
{
|
||||
using std::abs;
|
||||
EIGEN_USING_STD_MATH(sqrt)
|
||||
EIGEN_USING_STD_MATH(abs)
|
||||
Index i0;
|
||||
// Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
|
||||
mat.diagonal().cwiseAbs().maxCoeff(&i0);
|
||||
@@ -616,8 +617,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
|
||||
VectorType c0, c1;
|
||||
n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
|
||||
n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
|
||||
if(n0>n1) res = c0/std::sqrt(n0);
|
||||
else res = c1/std::sqrt(n1);
|
||||
if(n0>n1) res = c0/sqrt(n0);
|
||||
else res = c1/sqrt(n1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -169,20 +169,38 @@ class QuaternionBase : public RotationBase<Derived, 3>
|
||||
/** return the result vector of \a v through the rotation*/
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
|
||||
|
||||
#ifdef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** \returns \c *this with scalar type casted to \a NewScalarType
|
||||
*
|
||||
* Note that if \a NewScalarType is equal to the current scalar type of \c *this
|
||||
* then this function smartly returns a const reference to \c *this.
|
||||
*/
|
||||
template<typename NewScalarType>
|
||||
EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const
|
||||
EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const;
|
||||
|
||||
#else
|
||||
|
||||
template<typename NewScalarType>
|
||||
EIGEN_DEVICE_FUNC inline
|
||||
typename internal::enable_if<internal::is_same<Scalar,NewScalarType>::value,const Derived&>::type cast() const
|
||||
{
|
||||
return typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type(derived());
|
||||
return derived();
|
||||
}
|
||||
|
||||
template<typename NewScalarType>
|
||||
EIGEN_DEVICE_FUNC inline
|
||||
typename internal::enable_if<!internal::is_same<Scalar,NewScalarType>::value,Quaternion<NewScalarType> >::type cast() const
|
||||
{
|
||||
return Quaternion<NewScalarType>(coeffs().template cast<NewScalarType>());
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_QUATERNIONBASE_PLUGIN
|
||||
# include EIGEN_QUATERNIONBASE_PLUGIN
|
||||
#endif
|
||||
protected:
|
||||
EIGEN_DEFAULT_COPY_CONSTRUCTOR(QuaternionBase)
|
||||
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(QuaternionBase)
|
||||
};
|
||||
|
||||
/***************************************************************************
|
||||
|
||||
2
Eigen/src/Geometry/Scaling.h
Executable file → Normal file
2
Eigen/src/Geometry/Scaling.h
Executable file → Normal file
@@ -14,7 +14,7 @@ namespace Eigen {
|
||||
|
||||
/** \geometry_module \ingroup Geometry_Module
|
||||
*
|
||||
* \class Scaling
|
||||
* \class UniformScaling
|
||||
*
|
||||
* \brief Represents a generic uniform scaling transformation
|
||||
*
|
||||
|
||||
@@ -252,11 +252,11 @@ protected:
|
||||
public:
|
||||
|
||||
/** Default constructor without initialization of the meaningful coefficients.
|
||||
* If Mode==Affine, then the last row is set to [0 ... 0 1] */
|
||||
* If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */
|
||||
EIGEN_DEVICE_FUNC inline Transform()
|
||||
{
|
||||
check_template_params();
|
||||
internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
|
||||
internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline Transform(const Transform& other)
|
||||
|
||||
@@ -138,12 +138,6 @@ public:
|
||||
/** \returns the inverse translation (opposite) */
|
||||
Translation inverse() const { return Translation(-m_coeffs); }
|
||||
|
||||
Translation& operator=(const Translation& other)
|
||||
{
|
||||
m_coeffs = other.m_coeffs;
|
||||
return *this;
|
||||
}
|
||||
|
||||
static const Translation Identity() { return Translation(VectorType::Zero()); }
|
||||
|
||||
/** \returns \c *this with scalar type casted to \a NewScalarType
|
||||
|
||||
@@ -87,7 +87,7 @@ struct umeyama_transform_matrix_type
|
||||
* \f{align*}
|
||||
* T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix}
|
||||
* \f}
|
||||
* minimizing the resudiual above. This transformation is always returned as an
|
||||
* minimizing the residual above. This transformation is always returned as an
|
||||
* Eigen::Matrix.
|
||||
*/
|
||||
template <typename Derived, typename OtherDerived>
|
||||
|
||||
@@ -443,7 +443,8 @@ typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,Ot
|
||||
return res;
|
||||
}
|
||||
|
||||
/** \ingroup Householder_Module \householder_module
|
||||
/** \ingroup Householder_Module
|
||||
* \householder_module
|
||||
* \brief Convenience function for constructing a Householder sequence.
|
||||
* \returns A HouseholderSequence constructed from the specified arguments.
|
||||
*/
|
||||
@@ -453,7 +454,8 @@ HouseholderSequence<VectorsType,CoeffsType> householderSequence(const VectorsTyp
|
||||
return HouseholderSequence<VectorsType,CoeffsType,OnTheLeft>(v, h);
|
||||
}
|
||||
|
||||
/** \ingroup Householder_Module \householder_module
|
||||
/** \ingroup Householder_Module
|
||||
* \householder_module
|
||||
* \brief Convenience function for constructing a Householder sequence.
|
||||
* \returns A HouseholderSequence constructed from the specified arguments.
|
||||
* \details This function differs from householderSequence() in that the template argument \p OnTheSide of
|
||||
|
||||
@@ -53,7 +53,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
|
||||
* Output: \verbinclude class_FullPivLU.out
|
||||
*
|
||||
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
|
||||
*
|
||||
*
|
||||
* \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
|
||||
*/
|
||||
template<typename _MatrixType> class FullPivLU
|
||||
@@ -744,7 +744,7 @@ struct image_retval<FullPivLU<_MatrixType> >
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename _MatrixType>
|
||||
template<typename RhsType, typename DstType>
|
||||
void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
EIGEN_DEVICE_FUNC void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
{
|
||||
/* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
|
||||
* So we proceed as follows:
|
||||
@@ -792,7 +792,7 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
|
||||
template<typename _MatrixType>
|
||||
template<bool Conjugate, typename RhsType, typename DstType>
|
||||
void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
|
||||
EIGEN_DEVICE_FUNC void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
|
||||
{
|
||||
/* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1},
|
||||
* and since permutations are real and unitary, we can write this
|
||||
@@ -864,7 +864,7 @@ struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_
|
||||
{
|
||||
typedef FullPivLU<MatrixType> LuType;
|
||||
typedef Inverse<LuType> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename MatrixType::Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename MatrixType::Scalar> &)
|
||||
{
|
||||
dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_INVERSE_IMPL_H
|
||||
#define EIGEN_INVERSE_IMPL_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -72,7 +72,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
|
||||
****************************/
|
||||
|
||||
template<typename MatrixType, typename ResultType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline void compute_inverse_size2_helper(
|
||||
const MatrixType& matrix, const typename ResultType::Scalar& invdet,
|
||||
ResultType& result)
|
||||
@@ -122,7 +122,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
|
||||
****************************/
|
||||
|
||||
template<typename MatrixType, int i, int j>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
|
||||
{
|
||||
enum {
|
||||
@@ -200,7 +200,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
|
||||
****************************/
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const typename Derived::Scalar general_det3_helper
|
||||
(const MatrixBase<Derived>& matrix, int i1, int i2, int i3, int j1, int j2, int j3)
|
||||
{
|
||||
@@ -209,7 +209,7 @@ inline const typename Derived::Scalar general_det3_helper
|
||||
}
|
||||
|
||||
template<typename MatrixType, int i, int j>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
|
||||
{
|
||||
enum {
|
||||
@@ -290,13 +290,13 @@ template<typename DstXprType, typename XprType>
|
||||
struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
|
||||
{
|
||||
typedef Inverse<XprType> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
Index dstCols = src.cols();
|
||||
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
|
||||
dst.resize(dstRows, dstCols);
|
||||
|
||||
|
||||
const int Size = EIGEN_PLAIN_ENUM_MIN(XprType::ColsAtCompileTime,DstXprType::ColsAtCompileTime);
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(Size);
|
||||
eigen_assert(( (Size<=1) || (Size>4) || (extract_data(src.nestedExpression())!=extract_data(dst)))
|
||||
@@ -304,14 +304,14 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
|
||||
|
||||
typedef typename internal::nested_eval<XprType,XprType::ColsAtCompileTime>::type ActualXprType;
|
||||
typedef typename internal::remove_all<ActualXprType>::type ActualXprTypeCleanded;
|
||||
|
||||
|
||||
ActualXprType actual_xpr(src.nestedExpression());
|
||||
|
||||
|
||||
compute_inverse<ActualXprTypeCleanded, DstXprType>::run(actual_xpr, dst);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
/** \lu_module
|
||||
|
||||
@@ -69,7 +69,7 @@ struct enable_if_ref<Ref<T>,Derived> {
|
||||
* The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
|
||||
*
|
||||
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
|
||||
*
|
||||
*
|
||||
* \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
|
||||
*/
|
||||
template<typename _MatrixType> class PartialPivLU
|
||||
@@ -519,7 +519,10 @@ void PartialPivLU<MatrixType>::compute()
|
||||
// the row permutation is stored as int indices, so just to be sure:
|
||||
eigen_assert(m_lu.rows()<NumTraits<int>::highest());
|
||||
|
||||
m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
|
||||
if(m_lu.cols()>0)
|
||||
m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
|
||||
else
|
||||
m_l1_norm = RealScalar(0);
|
||||
|
||||
eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
|
||||
const Index size = m_lu.rows();
|
||||
@@ -569,7 +572,7 @@ struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assi
|
||||
{
|
||||
typedef PartialPivLU<MatrixType> LuType;
|
||||
typedef Inverse<LuType> SrcXprType;
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename LuType::Scalar> &)
|
||||
static EIGEN_DEVICE_FUNC void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename LuType::Scalar> &)
|
||||
{
|
||||
dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
|
||||
static void run(const MatrixType& mat, ResultType& result)
|
||||
{
|
||||
ActualMatrixType matrix(mat);
|
||||
EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
|
||||
const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000));
|
||||
|
||||
// Load the full matrix into registers
|
||||
__m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
|
||||
@@ -139,7 +139,7 @@ struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
|
||||
iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
|
||||
|
||||
rd = _mm_shuffle_ps(rd,rd,0);
|
||||
rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP));
|
||||
rd = _mm_xor_ps(rd, p4f_sign_PNNP);
|
||||
|
||||
// iB = C*|B| - D*B#*A
|
||||
iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
|
||||
|
||||
@@ -192,7 +192,8 @@ class PardisoImpl : public SparseSolverBase<Derived>
|
||||
void pardisoInit(int type)
|
||||
{
|
||||
m_type = type;
|
||||
bool symmetric = std::abs(m_type) < 10;
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
bool symmetric = abs(m_type) < 10;
|
||||
m_iparm[0] = 1; // No solver default
|
||||
m_iparm[1] = 2; // use Metis for the ordering
|
||||
m_iparm[2] = 0; // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user