diff options
Diffstat (limited to '')
92 files changed, 0 insertions, 40984 deletions
diff --git a/src/EigenUnsupported/CXX11/CMakeLists.txt b/src/EigenUnsupported/CXX11/CMakeLists.txt deleted file mode 100644 index 385ed24..0000000 --- a/src/EigenUnsupported/CXX11/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool) - -install(FILES - ${Eigen_CXX11_HEADERS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel - ) - -install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/src/EigenUnsupported/CXX11/Tensor b/src/EigenUnsupported/CXX11/Tensor deleted file mode 100644 index 0938bb5..0000000 --- a/src/EigenUnsupported/CXX11/Tensor +++ /dev/null @@ -1,137 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -//#ifndef EIGEN_CXX11_TENSOR_MODULE -//#define EIGEN_CXX11_TENSOR_MODULE - -#include "../../../Eigen/Core" - -#if EIGEN_HAS_CXX11 - -#include "../SpecialFunctions" - -#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" -#include "src/util/CXX11Meta.h" -#include "src/util/MaxSizeVector.h" - -/** \defgroup CXX11_Tensor_Module Tensor Module - * - * This module provides a Tensor class for storing arbitrarily indexed - * objects. - * - * \code - * #include <Eigen/CXX11/Tensor> - * \endcode - * - * Much of the documentation can be found \ref eigen_tensors "here". 
- */ - -#include <atomic> -#include <chrono> -#include <cmath> -#include <cstddef> -#include <cstring> -#include <random> -#include <thread> - -#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) -#include "ThreadPool" -#endif - -#ifdef EIGEN_USE_GPU - #include <iostream> - #if defined(EIGEN_USE_HIP) - #include <hip/hip_runtime.h> - #else - #include <cuda_runtime.h> - #endif -#endif - -#include "src/Tensor/TensorMacros.h" -#include "src/Tensor/TensorForwardDeclarations.h" -#include "src/Tensor/TensorMeta.h" -#include "src/Tensor/TensorFunctors.h" -#include "src/Tensor/TensorCostModel.h" -#include "src/Tensor/TensorDeviceDefault.h" -#include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceGpu.h" -#ifndef gpu_assert -#define gpu_assert(x) -#endif -#include "src/Tensor/TensorDeviceSycl.h" -#include "src/Tensor/TensorIndexList.h" -#include "src/Tensor/TensorDimensionList.h" -#include "src/Tensor/TensorDimensions.h" -#include "src/Tensor/TensorInitializer.h" -#include "src/Tensor/TensorTraits.h" -#include "src/Tensor/TensorRandom.h" -#include "src/Tensor/TensorUInt128.h" -#include "src/Tensor/TensorIntDiv.h" -#include "src/Tensor/TensorGlobalFunctions.h" - -#include "src/Tensor/TensorBase.h" -#include "src/Tensor/TensorBlock.h" - -#include "src/Tensor/TensorEvaluator.h" -#include "src/Tensor/TensorExpr.h" -#include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionGpu.h" -#include "src/Tensor/TensorArgMax.h" -#include "src/Tensor/TensorConcatenation.h" -#include "src/Tensor/TensorContractionMapper.h" -#include "src/Tensor/TensorContractionBlocking.h" -#include "src/Tensor/TensorContraction.h" -#include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionGpu.h" -#include "src/Tensor/TensorConversion.h" -#include "src/Tensor/TensorConvolution.h" -#include "src/Tensor/TensorFFT.h" -#include "src/Tensor/TensorPatch.h" -#include "src/Tensor/TensorImagePatch.h" -#include 
"src/Tensor/TensorVolumePatch.h" -#include "src/Tensor/TensorBroadcasting.h" -#include "src/Tensor/TensorChipping.h" -#include "src/Tensor/TensorInflation.h" -#include "src/Tensor/TensorLayoutSwap.h" -#include "src/Tensor/TensorMorphing.h" -#include "src/Tensor/TensorPadding.h" -#include "src/Tensor/TensorReverse.h" -#include "src/Tensor/TensorShuffling.h" -#include "src/Tensor/TensorStriding.h" -#include "src/Tensor/TensorCustomOp.h" -#include "src/Tensor/TensorEvalTo.h" -#include "src/Tensor/TensorForcedEval.h" -#include "src/Tensor/TensorGenerator.h" -#include "src/Tensor/TensorAssign.h" -#include "src/Tensor/TensorScan.h" -#include "src/Tensor/TensorTrace.h" - -#ifdef EIGEN_USE_SYCL -#include "src/Tensor/TensorReductionSycl.h" -#include "src/Tensor/TensorConvolutionSycl.h" -#include "src/Tensor/TensorContractionSycl.h" -#include "src/Tensor/TensorScanSycl.h" -#endif - -#include "src/Tensor/TensorExecutor.h" -#include "src/Tensor/TensorDevice.h" - -#include "src/Tensor/TensorStorage.h" -#include "src/Tensor/Tensor.h" -#include "src/Tensor/TensorFixedSize.h" -#include "src/Tensor/TensorMap.h" -#include "src/Tensor/TensorRef.h" - -#include "src/Tensor/TensorIO.h" - -#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" - -#endif // EIGEN_HAS_CXX11 -//#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/src/EigenUnsupported/CXX11/TensorSymmetry b/src/EigenUnsupported/CXX11/TensorSymmetry deleted file mode 100644 index b09c5e4..0000000 --- a/src/EigenUnsupported/CXX11/TensorSymmetry +++ /dev/null @@ -1,42 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE -#define EIGEN_CXX11_TENSORSYMMETRY_MODULE - -#include "Tensor" - -#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" - -#include "src/util/CXX11Meta.h" - -/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module - * - * This module provides classes that allow for the definition of - * symmetries w.r.t. tensor indices. - * - * Including this module will implicitly include the Tensor module. - * - * \code - * #include <Eigen/TensorSymmetry> - * \endcode - */ - -#include "src/TensorSymmetry/util/TemplateGroupTheory.h" -#include "src/TensorSymmetry/Symmetry.h" -#include "src/TensorSymmetry/StaticSymmetry.h" -#include "src/TensorSymmetry/DynamicSymmetry.h" - -#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" - -#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/src/EigenUnsupported/CXX11/ThreadPool b/src/EigenUnsupported/CXX11/ThreadPool deleted file mode 100644 index c5cafb2..0000000 --- a/src/EigenUnsupported/CXX11/ThreadPool +++ /dev/null @@ -1,74 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_THREADPOOL_MODULE -#define EIGEN_CXX11_THREADPOOL_MODULE - -#include "../../../Eigen/Core" - -#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" - -/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module - * - * This module provides 2 threadpool implementations - * - a simple reference implementation - * - a faster non blocking implementation - * - * This module requires C++11. 
- * - * \code - * #include <Eigen/CXX11/ThreadPool> - * \endcode - */ - - -// The code depends on CXX11, so only include the module if the -// compiler supports it. -#if (EIGEN_COMP_CXXVER >= 11) -#include <cstddef> -#include <cstring> -#include <time.h> - -#include <vector> -#include <atomic> -#include <condition_variable> -#include <deque> -#include <mutex> -#include <thread> -#include <functional> -#include <memory> -#include <utility> - -// There are non-parenthesized calls to "max" in the <unordered_map> header, -// which trigger a check in test/main.h causing compilation to fail. -// We work around the check here by removing the check for max in -// the case where we have to emulate thread_local. -#ifdef max -#undef max -#endif -#include <unordered_map> - -#include "src/util/CXX11Meta.h" -#include "src/util/MaxSizeVector.h" - -#include "src/ThreadPool/ThreadLocal.h" -#include "src/ThreadPool/ThreadYield.h" -#include "src/ThreadPool/ThreadCancel.h" -#include "src/ThreadPool/EventCount.h" -#include "src/ThreadPool/RunQueue.h" -#include "src/ThreadPool/ThreadPoolInterface.h" -#include "src/ThreadPool/ThreadEnvironment.h" -#include "src/ThreadPool/Barrier.h" -#include "src/ThreadPool/NonBlockingThreadPool.h" - -#endif - -#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" - -#endif // EIGEN_CXX11_THREADPOOL_MODULE diff --git a/src/EigenUnsupported/CXX11/src/Tensor/README.md b/src/EigenUnsupported/CXX11/src/Tensor/README.md deleted file mode 100644 index 2f65b1b..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/README.md +++ /dev/null @@ -1,1815 +0,0 @@ -# Eigen Tensors {#eigen_tensors} - -Tensors are multidimensional arrays of elements. Elements are typically scalars, -but more complex types such as strings are also supported. - -## Tensor Classes - -You can manipulate a tensor with one of the following classes. 
They all are in -the namespace `::Eigen`. - - -### Class Tensor<data_type, rank> - -This is the class to use to create a tensor and allocate memory for it. The -class is templatized with the tensor datatype, such as float or int, and the -tensor rank. The rank is the number of dimensions, for example rank 2 is a -matrix. - -Tensors of this class are resizable. For example, if you assign a tensor of a -different size to a Tensor, that tensor is resized to match its new value. - -#### Constructor Tensor<data_type, rank>(size0, size1, ...) - -Constructor for a Tensor. The constructor must be passed `rank` integers -indicating the sizes of the instance along each of the `rank` -dimensions. - - // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns - // memory to hold 24 floating point values (24 = 2 x 3 x 4). - Tensor<float, 3> t_3d(2, 3, 4); - - // Resize t_3d by assigning a tensor of different sizes, but same rank. - t_3d = Tensor<float, 3>(3, 4, 3); - -#### Constructor Tensor<data_type, rank>(size_array) - -Constructor where the sizes for the constructor are specified as an array of -values instead of an explicit list of parameters. The array type to use is -`Eigen::array<Eigen::Index>`. The array can be constructed automatically -from an initializer list. - - // Create a tensor of strings of rank 2 with sizes 5, 7. - Tensor<string, 2> t_2d({5, 7}); - - -### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>> - -Class to use for tensors of fixed size, where the size is known at compile -time. Fixed sized tensors can provide very fast computations because all their -dimensions are known by the compiler. FixedSize tensors are not resizable. - -If the total number of elements in a fixed size tensor is small enough the -tensor data is held onto the stack and does not cause heap allocation and free. - - // Create a 4 x 3 tensor of floats. 
- TensorFixedSize<float, Sizes<4, 3>> t_4x3; - -### Class TensorMap<Tensor<data_type, rank>> - -This is the class to use to create a tensor on top of memory allocated and -owned by another part of your code. It allows to view any piece of allocated -memory as a Tensor. Instances of this class do not own the memory where the -data are stored. - -A TensorMap is not resizable because it does not own the memory where its data -are stored. - -#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...) - -Constructor for a Tensor. The constructor must be passed a pointer to the -storage for the data, and "rank" size attributes. The storage has to be -large enough to hold all the data. - - // Map a tensor of ints on top of stack-allocated storage. - int storage[128]; // 2 x 4 x 2 x 8 = 128 - TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8); - - // The same storage can be viewed as a different tensor. - // You can also pass the sizes as an array. - TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8); - - // You can also map fixed-size tensors. Here we get a 1d view of - // the 2d fixed-size tensor. - TensorFixedSize<float, Sizes<4, 3>> t_4x3; - TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12); - - -#### Class TensorRef - -See Assigning to a TensorRef below. - -## Accessing Tensor Elements - -#### <data_type> tensor(index0, index1...) - -Return the element at position `(index0, index1...)` in tensor -`tensor`. You must pass as many parameters as the rank of `tensor`. -The expression can be used as an l-value to set the value of the element at the -specified position. The value returned is of the datatype of the tensor. - - // Set the value of the element at position (0, 1, 0); - Tensor<float, 3> t_3d(2, 3, 4); - t_3d(0, 1, 0) = 12.0f; - - // Initialize all elements to random values. 
- for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 4; ++k) { - t_3d(i, j, k) = ...some random value...; - } - } - } - - // Print elements of a tensor. - for (int i = 0; i < 2; ++i) { - LOG(INFO) << t_3d(i, 0, 0); - } - - -## TensorLayout - -The tensor library supports 2 layouts: `ColMajor` (the default) and -`RowMajor`. Only the default column major layout is currently fully -supported, and it is therefore not recommended to attempt to use the row major -layout at the moment. - -The layout of a tensor is optionally specified as part of its type. If not -specified explicitly column major is assumed. - - Tensor<float, 3, ColMajor> col_major; // equivalent to Tensor<float, 3> - TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...); - -All the arguments to an expression must use the same layout. Attempting to mix -different layouts will result in a compilation error. - -It is possible to change the layout of a tensor or an expression using the -`swap_layout()` method. Note that this will also reverse the order of the -dimensions. - - Tensor<float, 2, ColMajor> col_major(2, 4); - Tensor<float, 2, RowMajor> row_major(2, 4); - - Tensor<float, 2> col_major_result = col_major; // ok, layouts match - Tensor<float, 2> col_major_result = row_major; // will not compile - - // Simple layout swap - col_major_result = row_major.swap_layout(); - eigen_assert(col_major_result.dimension(0) == 4); - eigen_assert(col_major_result.dimension(1) == 2); - - // Swap the layout and preserve the order of the dimensions - array<int, 2> shuffle(1, 0); - col_major_result = row_major.swap_layout().shuffle(shuffle); - eigen_assert(col_major_result.dimension(0) == 2); - eigen_assert(col_major_result.dimension(1) == 4); - - -## Tensor Operations - -The Eigen Tensor library provides a vast library of operations on Tensors: -numerical operations such as addition and multiplication, geometry operations -such as slicing and shuffling, etc. 
These operations are available as methods -of the Tensor classes, and in some cases as operator overloads. For example -the following code computes the elementwise addition of two tensors: - - Tensor<float, 3> t1(2, 3, 4); - ...set some values in t1... - Tensor<float, 3> t2(2, 3, 4); - ...set some values in t2... - // Set t3 to the element wise sum of t1 and t2 - Tensor<float, 3> t3 = t1 + t2; - -While the code above looks easy enough, it is important to understand that the -expression `t1 + t2` is not actually adding the values of the tensors. The -expression instead constructs a "tensor operator" object of the class -TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors -`t1` and `t2`. This is a small C++ object that knows how to add -`t1` and `t2`. It is only when the value of the expression is assigned -to the tensor `t3` that the addition is actually performed. Technically, -this happens through the overloading of `operator=()` in the Tensor class. - -This mechanism for computing tensor expressions allows for lazy evaluation and -optimizations which are what make the tensor library very fast. - -Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f` -is actually represented with the (approximate) tree of operators: - - TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f)) - - -### Tensor Operations and C++ "auto" - -Because Tensor operations create tensor operators, the C++ `auto` keyword -does not have its intuitive meaning. Consider these 2 lines of code: - - Tensor<float, 3> t3 = t1 + t2; - auto t4 = t1 + t2; - -In the first line we allocate the tensor `t3` and it will contain the -result of the addition of `t1` and `t2`. In the second line, `t4` -is actually the tree of tensor operators that will compute the addition of -`t1` and `t2`. 
In fact, `t4` is *not* a tensor and you cannot get -the values of its elements: - - Tensor<float, 3> t3 = t1 + t2; - cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) - - auto t4 = t1 + t2; - cout << t4(0, 0, 0); // Compilation error! - -When you use `auto` you do not get a Tensor as a result but instead a -non-evaluated expression. So only use `auto` to delay evaluation. - -Unfortunately, there is no single underlying concrete type for holding -non-evaluated expressions, hence you have to use auto in the case when you do -want to hold non-evaluated expressions. - -When you need the results of a set of tensor computations you have to assign the -result to a Tensor that will be capable of holding onto them. This can be -either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing -piece of memory. All the following will work: - - auto t4 = t1 + t2; - - Tensor<float, 3> result = t4; // Could also be: result(t4); - cout << result(0, 0, 0); - - TensorMap<float, 4> result(<a float* with enough space>, <size0>, ...) = t4; - cout << result(0, 0, 0); - - TensorFixedSize<float, Sizes<size0, ...>> result = t4; - cout << result(0, 0, 0); - -Until you need the results, you can keep the operation around, and even reuse -it for additional operations. As long as you keep the expression as an -operation, no computation is performed. - - // One way to compute exp((t1 + t2) * 0.2f); - auto t3 = t1 + t2; - auto t4 = t3 * 0.2f; - auto t5 = t4.exp(); - Tensor<float, 3> result = t5; - - // Another way, exactly as efficient as the previous one: - Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp(); - -### Controlling When Expressions Are Evaluated - -There are several ways to control when expressions are evaluated: - -* Assignment to a Tensor, TensorFixedSize, or TensorMap. -* Use of the eval() method. -* Assignment to a TensorRef. - -#### Assigning to a Tensor, TensorFixedSize, or TensorMap. 
- -The most common way to evaluate an expression is to assign it to a Tensor. In -the example below, the `auto` declarations make the intermediate values -"Operations", not Tensors, and do not cause the expressions to be evaluated. -The assignment to the Tensor `result` causes the evaluation of all the -operations. - - auto t3 = t1 + t2; // t3 is an Operation. - auto t4 = t3 * 0.2f; // t4 is an Operation. - auto t5 = t4.exp(); // t5 is an Operation. - Tensor<float, 3> result = t5; // The operations are evaluated. - -If you know the ranks and sizes of the Operation value you can assign the -Operation to a TensorFixedSize instead of a Tensor, which is a bit more -efficient. - - // We know that the result is a 4x4x2 tensor! - TensorFixedSize<float, Sizes<4, 4, 2>> result = t5; - -Similarly, assigning an expression to a TensorMap causes its evaluation. Like -tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to -have the rank and sizes of the expression that are assigned to them. - -#### Calling eval(). - -When you compute large composite expressions, you sometimes want to tell Eigen -that an intermediate value in the expression tree is worth evaluating ahead of -time. This is done by inserting a call to the `eval()` method of the -expression Operation. - - // The previous example could have been written: - Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp(); - - // If you want to compute (t1 + t2) once ahead of time you can write: - Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp(); - -Semantically, calling `eval()` is equivalent to materializing the value of -the expression in a temporary Tensor of the right size. The code above in -effect does: - - // .eval() knows the size! 
- TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2; - Tensor<float, 3> result = (tmp * 0.2f).exp(); - -Note that the return value of `eval()` is itself an Operation, so the -following code does not do what you may think: - - // Here t3 is an evaluation Operation. t3 has not been evaluated yet. - auto t3 = (t1 + t2).eval(); - - // You can use t3 in another expression. Still no evaluation. - auto t4 = (t3 * 0.2f).exp(); - - // The value is evaluated when you assign the Operation to a Tensor, using - // an intermediate tensor to represent t3. - Tensor<float, 3> result = t4; - -While in the examples above calling `eval()` does not make a difference in -performance, in other cases it can make a huge difference. In the expression -below the `broadcast()` expression causes the `X.maximum()` expression -to be evaluated many times: - - Tensor<...> X ...; - Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) - * beta).exp(); - -Inserting a call to `eval()` between the `maximum()` and -`reshape()` calls guarantees that maximum() is only computed once and -greatly speeds up execution: - - Tensor<...> Y = - ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) - * beta).exp(); - -In the other example below, the tensor `Y` is both used in the expression -and its assignment. This is an aliasing problem and if the evaluation is not -done in the right order Y will be updated incrementally during the evaluation -resulting in bogus results: - - Tensor<...> Y ...; - Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); - -Inserting a call to `eval()` between the `sum()` and `reshape()` -expressions ensures that the sum is computed before any updates to `Y` are -done. - - Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); - -Note that an eval around the full right hand side expression is not needed -because the generated code has to compute the i-th value of the right hand side -before assigning it to the left hand side. 
- -However, if you were assigning the expression value to a shuffle of `Y` -then you would need to force an eval for correctness by adding an `eval()` -call for the right hand side: - - Y.shuffle(...) = - (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); - - -#### Assigning to a TensorRef. - -If you need to access only a few elements from the value of an expression you -can avoid materializing the value in a full tensor by using a TensorRef. - -A TensorRef is a small wrapper class for any Eigen Operation. It provides -overloads for the `()` operator that let you access individual values in -the expression. TensorRef is convenient, because the Operation themselves do -not provide a way to access individual elements. - - // Create a TensorRef for the expression. The expression is not - // evaluated yet. - TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp(); - - // Use "ref" to access individual elements. The expression is evaluated - // on the fly. - float at_0 = ref(0, 0, 0); - cout << ref(0, 1, 0); - -Only use TensorRef when you need a subset of the values of the expression. -TensorRef only computes the values you access. However note that if you are -going to access all the values it will be much faster to materialize the -results in a Tensor first. - -In some cases, if the full Tensor result would be very large, you may save -memory by accessing it as a TensorRef. But not always. So don't count on it. - - -### Controlling How Expressions Are Evaluated - -The tensor library provides several implementations of the various operations -such as contractions and convolutions. The implementations are optimized for -different environments: single threaded on CPU, multi threaded on CPU, or on a -GPU using cuda. Additional implementations may be added later. - -You can choose which implementation to use with the `device()` call. 
If -you do not choose an implementation explicitly the default implementation that -uses a single thread on the CPU is used. - -The default implementation has been optimized for recent Intel CPUs, taking -advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the -library on ARM CPUs. Note that you need to pass compiler-dependent flags -to enable the use of SSE, AVX, and other instructions. - -For example, the following code adds two tensors using the default -single-threaded CPU implementation: - - Tensor<float, 2> a(30, 40); - Tensor<float, 2> b(30, 40); - Tensor<float, 2> c = a + b; - -To choose a different implementation you have to insert a `device()` call -before the assignment of the result. For technical C++ reasons this requires -that the Tensor for the result be declared on its own. This means that you -have to know the size of the result. - - Eigen::Tensor<float, 2> c(30, 40); - c.device(...) = a + b; - -The call to `device()` must be the last call on the left of the operator=. - -You must pass to the `device()` call an Eigen device object. There are -presently three devices you can use: DefaultDevice, ThreadPoolDevice and -GpuDevice. - - -#### Evaluating With the DefaultDevice - -This is exactly the same as not inserting a `device()` call. - - DefaultDevice my_device; - c.device(my_device) = a + b; - -#### Evaluating with a Thread Pool - - // Create the Eigen ThreadPool - Eigen::ThreadPool pool(8 /* number of threads in pool */); - - // Create the Eigen ThreadPoolDevice. - Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */); - - // Now just use the device when evaluating expressions. - Eigen::Tensor<float, 2> c(30, 50); - c.device(my_device) = a.contract(b, dot_product_dims); - - -#### Evaluating On GPU - -This is presently a bit more complicated than just using a thread pool device. -You need to create a GPU device but you also need to explicitly allocate the -memory for tensors with cuda. 
- - -## API Reference - -### Datatypes - -In the documentation of the tensor methods and Operation we mention datatypes -that are tensor-type specific: - -#### <Tensor-Type>::Dimensions - -Acts like an array of ints. Has an `int size` attribute, and can be -indexed like an array to access individual values. Used to represent the -dimensions of a tensor. See `dimensions()`. - -#### <Tensor-Type>::Index - -Acts like an `int`. Used for indexing tensors along their dimensions. See -`operator()`, `dimension()`, and `size()`. - -#### <Tensor-Type>::Scalar - -Represents the datatype of individual tensor elements. For example, for a -`Tensor<float>`, `Scalar` is the type `float`. See -`setConstant()`. - -#### <Operation> - -We use this pseudo type to indicate that a tensor Operation is returned by a -method. We indicate in the text the type and dimensions of the tensor that the -Operation returns after evaluation. - -The Operation will have to be evaluated, for example by assigning it to a -tensor, before you can access the values of the resulting tensor. You can also -access the values through a TensorRef. - - -## Built-in Tensor Methods - -These are usual C++ methods that act on tensors immediately. They are not -Operations which provide delayed evaluation of their results. Unless specified -otherwise, all the methods listed below are available on all tensor classes: -Tensor, TensorFixedSize, and TensorMap. - -## Metadata - -### int NumDimensions - -Constant value indicating the number of dimensions of a Tensor. This is also -known as the tensor "rank". - - Eigen::Tensor<float, 2> a(3, 4); - cout << "Dims " << a.NumDimensions; - => Dims 2 - -### Dimensions dimensions() - -Returns an array-like object representing the dimensions of the tensor. -The actual type of the `dimensions()` result is `<Tensor-Type>::``Dimensions`. 
- - Eigen::Tensor<float, 2> a(3, 4); - const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions(); - cout << "Dim size: " << d.size << ", dim 0: " << d[0] - << ", dim 1: " << d[1]; - => Dim size: 2, dim 0: 3, dim 1: 4 - -If you use a C++11 compiler, you can use `auto` to simplify the code: - - const auto& d = a.dimensions(); - cout << "Dim size: " << d.size << ", dim 0: " << d[0] - << ", dim 1: " << d[1]; - => Dim size: 2, dim 0: 3, dim 1: 4 - -### Index dimension(Index n) - -Returns the n-th dimension of the tensor. The actual type of the -`dimension()` result is `<Tensor-Type>::``Index`, but you can -always use it like an int. - - Eigen::Tensor<float, 2> a(3, 4); - int dim1 = a.dimension(1); - cout << "Dim 1: " << dim1; - => Dim 1: 4 - -### Index size() - -Returns the total number of elements in the tensor. This is the product of all -the tensor dimensions. The actual type of the `size()` result is -`<Tensor-Type>::``Index`, but you can always use it like an int. - - Eigen::Tensor<float, 2> a(3, 4); - cout << "Size: " << a.size(); - => Size: 12 - - -### Getting Dimensions From An Operation - -A few operations provide `dimensions()` directly, -e.g. `TensorReslicingOp`. Most operations defer calculating dimensions -until the operation is being evaluated. If you need access to the dimensions -of a deferred operation, you can wrap it in a TensorRef (see Assigning to a -TensorRef above), which provides `dimensions()` and `dimension()` as -above. - -TensorRef can also wrap the plain Tensor types, so this is a useful idiom in -templated contexts where the underlying object could be either a raw Tensor -or some deferred operation (e.g. a slice of a Tensor). In this case, the -template code can wrap the object in a TensorRef and reason about its -dimensionality while remaining agnostic to the underlying type. - - -## Constructors - -### Tensor - -Creates a tensor of the specified size. The number of arguments must be equal -to the rank of the tensor. 
The content of the tensor is not initialized. - - Eigen::Tensor<float, 2> a(3, 4); - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - -### TensorFixedSize - -Creates a tensor of the specified size. The number of arguments in the Sizes<> -template parameter determines the rank of the tensor. The content of the tensor -is not initialized. - - Eigen::TensorFixedSize<float, Sizes<3, 4>> a; - cout << "Rank: " << a.rank() << endl; - => Rank: 2 - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - -### TensorMap - -Creates a tensor mapping an existing array of data. The data must not be freed -until the TensorMap is discarded, and the size of the data must be large enough -to accommodate the coefficients of the tensor. - - float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4); - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - cout << "a(1, 2): " << a(1, 2) << endl; - => a(1, 2): 7 - - -## Contents Initialization - -When a new Tensor or a new TensorFixedSize is created, memory is allocated to -hold all the tensor elements, but the memory is not initialized. Similarly, -when a new TensorMap is created on top of non-initialized memory, its -contents are not initialized. - -You can use one of the methods below to initialize the tensor memory. These -have an immediate effect on the tensor and return the tensor itself as a -result. These are not tensor Operations which delay evaluation. - -### <Tensor-Type> setConstant(const Scalar& val) - -Sets all elements of the tensor to the constant value `val`. `Scalar` -is the type of data stored in the tensor. You can pass any value that is -convertible to that type. - -Returns the tensor itself in case you want to chain another call. 
- - a.setConstant(12.3f); - cout << "Constant: " << endl << a << endl << endl; - => - Constant: - 12.3 12.3 12.3 12.3 - 12.3 12.3 12.3 12.3 - 12.3 12.3 12.3 12.3 - -Note that `setConstant()` can be used on any tensor where the element type -has a copy constructor and an `operator=()`: - - Eigen::Tensor<string, 2> a(2, 3); - a.setConstant("yolo"); - cout << "String tensor: " << endl << a << endl << endl; - => - String tensor: - yolo yolo yolo - yolo yolo yolo - - -### <Tensor-Type> setZero() - -Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`. -Returns the tensor itself in case you want to chain another call. - - a.setZero(); - cout << "Zeros: " << endl << a << endl << endl; - => - Zeros: - 0 0 0 0 - 0 0 0 0 - 0 0 0 0 - - -### <Tensor-Type> setValues({..initializer_list}) - -Fills the tensor with explicit values specified in a std::initializer_list. -The type of the initializer list depends on the type and rank of the tensor. - -If the tensor has rank N, the initializer list must be nested N times. The -most deeply nested lists must contain P scalars of the Tensor type where P is -the size of the last dimension of the Tensor. - -For example, for a `TensorFixedSize<float, Sizes<2, 3>>` the initializer list must -contain 2 lists of 3 floats each. - -`setValues()` returns the tensor itself in case you want to chain another -call. - - Eigen::Tensor<float, 2> a(2, 3); - a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); - cout << "a" << endl << a << endl << endl; - => - a - 0 1 2 - 3 4 5 - -If a list is too short, the corresponding elements of the tensor will not be -changed. This is valid at each level of nesting. For example the following -code only sets the values of the first row of the tensor. - - Eigen::Tensor<int, 2> a(2, 3); - a.setConstant(1000); - a.setValues({{10, 20, 30}}); - cout << "a" << endl << a << endl << endl; - => - a - 10 20 30 - 1000 1000 1000 - -### <Tensor-Type> setRandom() - -Fills the tensor with random values.
Returns the tensor itself in case you -want to chain another call. - - a.setRandom(); - cout << "Random: " << endl << a << endl << endl; - => - Random: - 0.680375 0.59688 -0.329554 0.10794 - -0.211234 0.823295 0.536459 -0.0452059 - 0.566198 -0.604897 -0.444451 0.257742 - -You can customize `setRandom()` by providing your own random number -generator as a template argument: - - a.setRandom<MyRandomGenerator>(); - -Here, `MyRandomGenerator` must be a struct with the following member -functions, where Scalar and Index are the same as `<Tensor-Type>::``Scalar` -and `<Tensor-Type>::``Index`. - -See `struct UniformRandomGenerator` in TensorFunctors.h for an example. - - // Custom number generator for use with setRandom(). - struct MyRandomGenerator { - // Default and copy constructors. Both are needed - MyRandomGenerator() { } - MyRandomGenerator(const MyRandomGenerator& ) { } - - // Return a random value to be used. "element_location" is the - // location of the entry to set in the tensor, it can typically - // be ignored. - Scalar operator()(Eigen::DenseIndex element_location, - Eigen::DenseIndex /*unused*/ = 0) const { - return <randomly generated value of type T>; - } - - // Same as above but generates several numbers at a time. - typename internal::packet_traits<Scalar>::type packetOp( - Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { - return <a packet of randomly generated values>; - } - }; - -You can also use one of the 2 random number generators that are part of the -tensor library: -* UniformRandomGenerator -* NormalRandomGenerator - - -## Data Access - -The Tensor, TensorFixedSize, and TensorRef classes provide the following -accessors to access the tensor coefficients: - - const Scalar& operator()(const array<Index, NumIndices>& indices) - const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) - Scalar& operator()(const array<Index, NumIndices>& indices) - Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) - -The number of indices must be equal to the rank of the tensor. Moreover, these -accessors are not available on tensor expressions. In order to access the -values of a tensor expression, the expression must either be evaluated or -wrapped in a TensorRef. - - -### Scalar* data() and const Scalar* data() const - -Returns a pointer to the storage for the tensor. The pointer is const if the -tensor was const. This allows direct access to the data. The layout of the -data depends on the tensor layout: RowMajor or ColMajor. - -This access is usually only needed for special cases, for example when mixing -Eigen Tensor code with other libraries. - -Scalar is the type of data stored in the tensor. - - Eigen::Tensor<float, 2> a(3, 4); - float* a_data = a.data(); - a_data[0] = 123.45f; - cout << "a(0, 0): " << a(0, 0); - => a(0, 0): 123.45 - - -## Tensor Operations - -All the methods documented below return non evaluated tensor `Operations`. -These can be chained: you can apply another Tensor Operation to the value -returned by the method. - -The chain of Operation is evaluated lazily, typically when it is assigned to a -tensor. See "Controlling when Expression are Evaluated" for more details about -their evaluation. - -### <Operation> constant(const Scalar& val) - -Returns a tensor of the same type and dimensions as the original tensor but -where all elements have the value `val`. - -This is useful, for example, when you want to add or subtract a constant from a -tensor, or multiply every element of a tensor by a scalar. 
- - Eigen::Tensor<float, 2> a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor<float, 2> b = a + a.constant(2.0f); - Eigen::Tensor<float, 2> c = b * b.constant(0.2f); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - cout << "c" << endl << c << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - 3 3 3 - 3 3 3 - - c - 0.6 0.6 0.6 - 0.6 0.6 0.6 - -### <Operation> random() - -Returns a tensor of the same type and dimensions as the current tensor -but where all elements have random values. - -This is for example useful to add random values to an existing tensor. -The generation of random values can be customized in the same manner -as for `setRandom()`. - - Eigen::Tensor<float, 2> a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor<float, 2> b = a + a.random(); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - 1.68038 1.5662 1.82329 - 0.788766 1.59688 0.395103 - - -## Unary Element Wise Operations - -All these operations take a single input tensor as argument and return a tensor -of the same type and dimensions as the tensor to which they are applied. The -requested operations are applied to each element independently. - -### <Operation> operator-() - -Returns a tensor of the same type and dimensions as the original tensor -containing the opposite values of the original tensor. - - Eigen::Tensor<float, 2> a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor<float, 2> b = -a; - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - -1 -1 -1 - -1 -1 -1 - -### <Operation> sqrt() - -Returns a tensor of the same type and dimensions as the original tensor -containing the square roots of the original tensor. - -### <Operation> rsqrt() - -Returns a tensor of the same type and dimensions as the original tensor -containing the inverse square roots of the original tensor. 
- -### <Operation> square() - -Returns a tensor of the same type and dimensions as the original tensor -containing the squares of the original tensor values. - -### <Operation> inverse() - -Returns a tensor of the same type and dimensions as the original tensor -containing the inverse of the original tensor values. - -### <Operation> exp() - -Returns a tensor of the same type and dimensions as the original tensor -containing the exponential of the original tensor. - -### <Operation> log() - -Returns a tensor of the same type and dimensions as the original tensor -containing the natural logarithms of the original tensor. - -### <Operation> abs() - -Returns a tensor of the same type and dimensions as the original tensor -containing the absolute values of the original tensor. - -### <Operation> pow(Scalar exponent) - -Returns a tensor of the same type and dimensions as the original tensor -containing the coefficients of the original tensor to the power of the -exponent. - -The type of the exponent, Scalar, is always the same as the type of the -tensor coefficients. For example, only integer exponents can be used in -conjunction with tensors of integer values. - -You can use cast() to lift this restriction. For example this computes -cubic roots of an int Tensor: - - Eigen::Tensor<int, 2> a(2, 3); - a.setValues({{0, 1, 8}, {27, 64, 125}}); - Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 0 1 8 - 27 64 125 - - b - 0 1 2 - 3 4 5 - -### <Operation> operator * (Scalar scale) - -Multiplies all the coefficients of the input tensor by the provided scale. - -### <Operation> cwiseMax(Scalar threshold) -TODO - -### <Operation> cwiseMin(Scalar threshold) -TODO - -### <Operation> unaryExpr(const CustomUnaryOp& func) -TODO - - -## Binary Element Wise Operations - -These operations take two input tensors as arguments.
The 2 input tensors should -be of the same type and dimensions. The result is a tensor of the same -dimensions as the tensors to which they are applied, and unless otherwise -specified it is also of the same type. The requested operations are applied to -each pair of elements independently. - -### <Operation> operator+(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise sums of the inputs. - -### <Operation> operator-(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise differences of the inputs. - -### <Operation> operator*(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise products of the inputs. - -### <Operation> operator/(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise quotients of the inputs. - -This operator is not supported for integer types. - -### <Operation> cwiseMax(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise maximums of the inputs. - -### <Operation> cwiseMin(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise minimums of the inputs. - -### <Operation> Logical operators - -The following logical operators are supported as well: - -* operator&&(const OtherDerived& other) -* operator||(const OtherDerived& other) -* operator<(const OtherDerived& other) -* operator<=(const OtherDerived& other) -* operator>(const OtherDerived& other) -* operator>=(const OtherDerived& other) -* operator==(const OtherDerived& other) -* operator!=(const OtherDerived& other) - -They all return a tensor of boolean values.
- - -## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)) - -Selection is a coefficient-wise ternary operator that is the tensor equivalent -to the if-then-else operation. - - Tensor<bool, 3> if = ...; - Tensor<float, 3> then = ...; - Tensor<float, 3> else = ...; - Tensor<float, 3> result = if.select(then, else); - -The 3 arguments must be of the same dimensions, which will also be the dimension -of the result. The 'if' tensor must be of type boolean, the 'then' and the -'else' tensor must be of the same type, which will also be the type of the -result. - -Each coefficient in the result is equal to the corresponding coefficient in the -'then' tensor if the corresponding value in the 'if' tensor is true. If not, the -resulting coefficient will come from the 'else' tensor. - - -## Contraction - -Tensor *contractions* are a generalization of the matrix product to the -multidimensional case. - - // Create 2 matrices using tensors of rank 2 - Eigen::Tensor<int, 2> a(2, 3); - a.setValues({{1, 2, 3}, {6, 5, 4}}); - Eigen::Tensor<int, 2> b(3, 2); - b.setValues({{1, 2}, {4, 5}, {5, 6}}); - - // Compute the traditional matrix product - Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) }; - Eigen::Tensor<int, 2> AB = a.contract(b, product_dims); - - // Compute the product of the transpose of the matrices - Eigen::array<Eigen::IndexPair<int>, 1> transposed_product_dims = { Eigen::IndexPair<int>(0, 1) }; - Eigen::Tensor<int, 2> AtBt = a.contract(b, transposed_product_dims); - - // Contraction to scalar value using a double contraction. - // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements.
- Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) }; - Eigen::Tensor<int, 0> AdoubleContractedA = a.contract(a, double_contraction_product_dims); - - // Extracting the scalar value of the tensor contraction for further usage - int value = AdoubleContractedA(0); - -## Reduction Operations - -A *Reduction* operation returns a tensor with fewer dimensions than the -original tensor. The values in the returned tensor are computed by applying a -*reduction operator* to slices of values from the original tensor. You specify -the dimensions along which the slices are made. - -The Eigen Tensor library provides a set of predefined reduction operators such -as `maximum()` and `sum()` and lets you define additional operators by -implementing a few methods from a reductor template. - -### Reduction Dimensions - -All reduction operations take a single parameter of type -`<TensorType>::``Dimensions` which can always be specified as an array of -ints. These are called the "reduction dimensions." The values are the indices -of the dimensions of the input tensor over which the reduction is done. The -parameter can have at most as many element as the rank of the input tensor; -each element must be less than the tensor rank, as it indicates one of the -dimensions to reduce. - -Each dimension of the input tensor should occur at most once in the reduction -dimensions as the implementation does not remove duplicates. - -The order of the values in the reduction dimensions does not affect the -results, but the code may execute faster if you list the dimensions in -increasing order. - -Example: Reduction along one dimension. - - // Create a tensor of 2 dimensions - Eigen::Tensor<int, 2> a(2, 3); - a.setValues({{1, 2, 3}, {6, 5, 4}}); - // Reduce it along the second dimension (1)... - Eigen::array<int, 1> dims({1 /* dimension to reduce */}); - // ...using the "maximum" operator. 
- // The result is a tensor with one dimension. The size of - // that dimension is the same as the first (non-reduced) dimension of a. - Eigen::Tensor<int, 1> b = a.maximum(dims); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 2 3 - 6 5 4 - - b - 3 - 6 - -Example: Reduction along two dimensions. - - Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4); - a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, - {7.0f, 6.0f, 5.0f, 4.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}, - {{12.0f, 13.0f, 14.0f, 15.0f}, - {19.0f, 18.0f, 17.0f, 16.0f}, - {20.0f, 21.0f, 22.0f, 23.0f}}}); - // The tensor a has 3 dimensions. We reduce along the - // first 2, resulting in a tensor with a single dimension - // of size 4 (the last dimension of a.) - // Note that we pass the array of reduction dimensions - // directly to the maximum() call. - Eigen::Tensor<float, 1, Eigen::ColMajor> b = - a.maximum(Eigen::array<int, 2>({0, 1})); - cout << "b" << endl << b << endl << endl; - => - b - 20 - 21 - 22 - 23 - -#### Reduction along all dimensions - -As a special case, if you pass no parameter to a reduction operation the -original tensor is reduced along *all* its dimensions. The result is a -scalar, represented as a zero-dimension tensor. - - Eigen::Tensor<float, 3> a(2, 3, 4); - a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, - {7.0f, 6.0f, 5.0f, 4.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}, - {{12.0f, 13.0f, 14.0f, 15.0f}, - {19.0f, 18.0f, 17.0f, 16.0f}, - {20.0f, 21.0f, 22.0f, 23.0f}}}); - // Reduce along all dimensions using the sum() operator. - Eigen::Tensor<float, 0> b = a.sum(); - cout << "b" << endl << b << endl << endl; - => - b - 276 - - -### <Operation> sum(const Dimensions& new_dims) -### <Operation> sum() - -Reduce a tensor using the sum() operator. The resulting values -are the sum of the reduced values. - -### <Operation> mean(const Dimensions& new_dims) -### <Operation> mean() - -Reduce a tensor using the mean() operator. 
The resulting values -are the mean of the reduced values. - -### <Operation> maximum(const Dimensions& new_dims) -### <Operation> maximum() - -Reduce a tensor using the maximum() operator. The resulting values are the -largest of the reduced values. - -### <Operation> minimum(const Dimensions& new_dims) -### <Operation> minimum() - -Reduce a tensor using the minimum() operator. The resulting values -are the smallest of the reduced values. - -### <Operation> prod(const Dimensions& new_dims) -### <Operation> prod() - -Reduce a tensor using the prod() operator. The resulting values -are the product of the reduced values. - -### <Operation> all(const Dimensions& new_dims) -### <Operation> all() -Reduce a tensor using the all() operator. Casts tensor to bool and then checks -whether all elements are true. Runs through all elements rather than -short-circuiting, so may be significantly inefficient. - -### <Operation> any(const Dimensions& new_dims) -### <Operation> any() -Reduce a tensor using the any() operator. Casts tensor to bool and then checks -whether any element is true. Runs through all elements rather than -short-circuiting, so may be significantly inefficient. - - -### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer) - -Reduce a tensor using a user-defined reduction operator. See `SumReducer` -in TensorFunctors.h for information on how to implement a reduction operator. - - -## Trace - -A *Trace* operation returns a tensor with fewer dimensions than the original -tensor. It returns a tensor whose elements are the sum of the elements of the -original tensor along the main diagonal for a list of specified dimensions, the -"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions -are passed as an input parameter to the operation, are of type `<TensorType>::``Dimensions` -, and have the same requirements when passed as an input parameter. In addition, -the trace dimensions must have the same size. 
- -Example: Trace along 2 dimensions. - - // Create a tensor of 3 dimensions - Eigen::Tensor<int, 3> a(2, 2, 3); - a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}}); - // Specify the dimensions along which the trace will be computed. - // In this example, the trace can only be computed along the dimensions - // with indices 0 and 1 - Eigen::array<int, 2> dims({0, 1}); - // The output tensor contains all but the trace dimensions. - Tensor<int, 1> a_trace = a.trace(dims); - cout << "a_trace:" << endl; - cout << a_trace << endl; - => - a_trace: - 11 - 13 - 15 - - -### <Operation> trace(const Dimensions& new_dims) -### <Operation> trace() - -As a special case, if no parameter is passed to the operation, trace is computed -along *all* dimensions of the input tensor. - -Example: Trace along all dimensions. - - // Create a tensor of 3 dimensions, with all dimensions having the same size. - Eigen::Tensor<int, 3> a(3, 3, 3); - a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, - {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}, - {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}); - // Result is a zero dimension tensor - Tensor<int, 0> a_trace = a.trace(); - cout<<"a_trace:"<<endl; - cout<<a_trace<<endl; - => - a_trace: - 42 - - -## Scan Operations - -A *Scan* operation returns a tensor with the same dimensions as the original -tensor. The operation performs an inclusive scan along the specified -axis, which means it computes a running total along the axis for a given -reduction operation. -If the reduction operation corresponds to summation, then this computes the -prefix sum of the tensor along the given axis. 
- -Example: - - // Create a tensor of 2 dimensions - Eigen::Tensor<int, 2> a(2, 3); - a.setValues({{1, 2, 3}, {4, 5, 6}}); - // Scan it along the second dimension (1) using summation - Eigen::Tensor<int, 2> b = a.cumsum(1); - // The result is a tensor with the same size as the input - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 2 3 - 4 5 6 - - b - 1 3 6 - 4 9 15 - -### <Operation> cumsum(const Index& axis) - -Perform a scan by summing consecutive entries. - -### <Operation> cumprod(const Index& axis) - -Perform a scan by multiplying consecutive entries. - - -## Convolutions - -### <Operation> convolve(const Kernel& kernel, const Dimensions& dims) - -Returns a tensor that is the output of the convolution of the input tensor with the kernel, -along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor -which were part of the convolution will be reduced by the formula: -output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). -The dimension sizes for dimensions that were not part of the convolution will remain the same. -Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the -convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is -for the last dimension). - - // Compute convolution along the second and third dimension. - Tensor<float, 4, DataLayout> input(3, 3, 7, 11); - Tensor<float, 2, DataLayout> kernel(2, 2); - Tensor<float, 4, DataLayout> output(3, 2, 6, 11); - input.setRandom(); - kernel.setRandom(); - - Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution.
- output = input.convolve(kernel, dims); - - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 2; ++j) { - for (int k = 0; k < 6; ++k) { - for (int l = 0; l < 11; ++l) { - const float result = output(i,j,k,l); - const float expected = input(i,j+0,k+0,l) * kernel(0,0) + - input(i,j+1,k+0,l) * kernel(1,0) + - input(i,j+0,k+1,l) * kernel(0,1) + - input(i,j+1,k+1,l) * kernel(1,1); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - - -## Geometrical Operations - -These operations return a Tensor with different dimensions than the original -Tensor. They can be used to access slices of tensors, see them with different -dimensions, or pad tensors with additional data. - -### <Operation> reshape(const Dimensions& new_dims) - -Returns a view of the input tensor that has been reshaped to the specified -new dimensions. The argument new_dims is an array of Index values. The -rank of the resulting tensor is equal to the number of elements in new_dims. - -The product of all the sizes in the new dimension array must be equal to -the number of elements in the input tensor. - - // Increase the rank of the input tensor by introducing a new dimension - // of size 1. - Tensor<float, 2> input(7, 11); - array<int, 3> three_dims{{7, 11, 1}}; - Tensor<float, 3> result = input.reshape(three_dims); - - // Decrease the rank of the input tensor by merging 2 dimensions; - array<int, 1> one_dim{{7 * 11}}; - Tensor<float, 1> result = input.reshape(one_dim); - -This operation does not move any data in the input tensor, so the resulting -contents of a reshaped Tensor depend on the data layout of the original Tensor. 
- -For example this is what happens when you `reshape()` a 2D ColMajor tensor -to one dimension: - - Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2}); - Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim); - cout << "b" << endl << b << endl; - => - b - 0 - 300 - 100 - 400 - 200 - 500 - -This is what happens when the 2D Tensor is RowMajor: - - Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2}); - Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim); - cout << "b" << endl << b << endl; - => - b - 0 - 100 - 200 - 300 - 400 - 500 - -The reshape operation is a lvalue. In other words, it can be used on the left -side of the assignment operator. - -The previous example can be rewritten as follow: - - Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3}); - Eigen::Tensor<float, 1, Eigen::ColMajor> b(6); - b.reshape(two_dim) = a; - cout << "b" << endl << b << endl; - => - b - 0 - 300 - 100 - 400 - 200 - 500 - -Note that "b" itself was not reshaped but that instead the assignment is done to -the reshape view of b. - - -### <Operation> shuffle(const Shuffle& shuffle) - -Returns a copy of the input tensor whose dimensions have been -reordered according to the specified permutation. The argument shuffle -is an array of Index values. Its size is the rank of the input -tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th -dimension of the output tensor equals to the size of the shuffle[i]-th -dimension of the input tensor. For example: - - // Shuffle all dimensions to the left by 1. - Tensor<float, 3> input(20, 30, 50); - // ... set some values in input. 
- Tensor<float, 3> output = input.shuffle({1, 2, 0}); - - eigen_assert(output.dimension(0) == 30); - eigen_assert(output.dimension(1) == 50); - eigen_assert(output.dimension(2) == 20); - -Indices into the output tensor are shuffled accordingly to formulate -indices into the input tensor. For example, one can assert in the above -code snippet that: - - eigen_assert(output(3, 7, 11) == input(11, 3, 7)); - -In general, one can assert that - - eigen_assert(output(..., indices[shuffle[i]], ...) == - input(..., indices[i], ...)) - -The shuffle operation results in a lvalue, which means that it can be assigned -to. In other words, it can be used on the left side of the assignment operator. - -Let's rewrite the previous example to take advantage of this feature: - - // Shuffle all dimensions to the left by 1. - Tensor<float, 3> input(20, 30, 50); - // ... set some values in input. - Tensor<float, 3> output(30, 50, 20); - output.shuffle({2, 0, 1}) = input; - - -### <Operation> stride(const Strides& strides) - -Returns a view of the input tensor that strides (skips stride-1 -elements) along each of the dimensions. The argument strides is an -array of Index values. The dimensions of the resulting tensor are -ceil(input_dimensions[i] / strides[i]). - -For example this is what happens when you `stride()` a 2D tensor: - - Eigen::Tensor<int, 2> a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array<Eigen::DenseIndex, 2> strides({3, 2}); - Eigen::Tensor<int, 2> b = a.stride(strides); - cout << "b" << endl << b << endl; - => - b - 0 200 - 900 1100 - -It is possible to assign a tensor to a stride: - Tensor<float, 3> input(20, 30, 50); - // ... set some values in input. - Tensor<float, 3> output(40, 90, 200); - output.stride({2, 3, 4}) = input; - - -### <Operation> slice(const StartIndices& offsets, const Sizes& extents) - -Returns a sub-tensor of the given tensor.
For each dimension i, the slice is -made of the coefficients stored between offsets[i] and offsets[i] + extents[i] in -the input tensor. - - Eigen::Tensor<int, 2> a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array<int, 2> offsets = {1, 0}; - Eigen::array<int, 2> extents = {2, 2}; - Eigen::Tensor<int, 2> slice = a.slice(offsets, extents); - cout << "a" << endl << a << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - cout << "slice" << endl << slice << endl; - => - slice - 300 400 - 600 700 - - -### <Operation> chip(const Index offset, const Index dim) - -A chip is a special kind of slice. It is the subtensor at the given offset in -the dimension dim. The returned tensor has one fewer dimension than the input -tensor: the dimension dim is removed. - -For example, a matrix chip would be either a row or a column of the input -matrix. - - Eigen::Tensor<int, 2> a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::Tensor<int, 1> row_3 = a.chip(2, 0); - Eigen::Tensor<int, 1> col_2 = a.chip(1, 1); - cout << "a" << endl << a << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - cout << "row_3" << endl << row_3 << endl; - => - row_3 - 600 700 800 - cout << "col_2" << endl << col_2 << endl; - => - col_2 - 100 400 700 1000 - -It is possible to assign values to a tensor chip since the chip operation is a -lvalue. For example: - - Eigen::Tensor<int, 1> a(3); - a.setValues({{100, 200, 300}}); - Eigen::Tensor<int, 2> b(2, 3); - b.setZero(); - b.chip(0, 0) = a; - cout << "a" << endl << a << endl; - => - a - 100 - 200 - 300 - cout << "b" << endl << b << endl; - => - b - 100 200 300 - 0 0 0 - - -### <Operation> reverse(const ReverseDimensions& reverse) - -Returns a view of the input tensor that reverses the order of the coefficients -along a subset of the dimensions.
The argument reverse is an array of boolean -values that indicates whether or not the order of the coefficients should be -reversed along each of the dimensions. This operation preserves the dimensions -of the input tensor. - -For example this is what happens when you `reverse()` the first dimension -of a 2D tensor: - - Eigen::Tensor<int, 2> a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array<bool, 2> reverse({true, false}); - Eigen::Tensor<int, 2> b = a.reverse(reverse); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - b - 900 1000 1100 - 600 700 800 - 300 400 500 - 0 100 200 - - -### <Operation> broadcast(const Broadcast& broadcast) - -Returns a view of the input tensor in which the input is replicated one to many -times. -The broadcast argument specifies how many copies of the input tensor need to be -made in each of the dimensions. - - Eigen::Tensor<int, 2> a(2, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}}); - Eigen::array<int, 2> bcast({3, 2}); - Eigen::Tensor<int, 2> b = a.broadcast(bcast); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - b - 0 100 200 0 100 200 - 300 400 500 300 400 500 - 0 100 200 0 100 200 - 300 400 500 300 400 500 - 0 100 200 0 100 200 - 300 400 500 300 400 500 - -### <Operation> concatenate(const OtherDerived& other, Axis axis) - -TODO - -### <Operation> pad(const PaddingDimensions& padding) - -Returns a view of the input tensor in which the input is padded with zeros. 
- - Eigen::Tensor<int, 2> a(2, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}}); - Eigen::array<pair<int, int>, 2> paddings; - paddings[0] = make_pair(0, 1); - paddings[1] = make_pair(2, 3); - Eigen::Tensor<int, 2> b = a.pad(paddings); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - b - 0 0 0 0 - 0 0 0 0 - 0 100 200 0 - 300 400 500 0 - 0 0 0 0 - 0 0 0 0 - 0 0 0 0 - - -### <Operation> extract_patches(const PatchDims& patch_dims) - -Returns a tensor of coefficient patches extracted from the input tensor, where -each patch is of dimension specified by 'patch_dims'. The returned tensor has -one greater dimension than the input tensor, which is used to index each patch. -The patch index in the output tensor depends on the data layout of the input -tensor: the patch index is the last dimension in ColMajor layout, and the first -dimension in RowMajor layout. - -For example, given the following input tensor: - - Eigen::Tensor<float, 2, DataLayout> tensor(3,4); - tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, - {4.0f, 5.0f, 6.0f, 7.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}); - - cout << "tensor: " << endl << tensor << endl; - => - tensor: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 - -Six 2x2 patches can be extracted and indexed using the following code: - - Eigen::Tensor<float, 3, DataLayout> patch; - Eigen::array<ptrdiff_t, 2> patch_dims; - patch_dims[0] = 2; - patch_dims[1] = 2; - patch = tensor.extract_patches(patch_dims); - for (int k = 0; k < 6; ++k) { - cout << "patch index: " << k << endl; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - if (DataLayout == ColMajor) { - cout << patch(i, j, k) << " "; - } else { - cout << patch(k, i, j) << " "; - } - } - cout << endl; - } - } - -This code results in the following output when the data layout is ColMajor: - - patch index: 0 - 0 1 - 4 5 - patch index: 1 - 4 5 - 8 9 - patch index: 2 - 1 2 - 5 6 - patch index: 3 - 5 6 - 9 10 - patch index: 4 - 2 3 - 6 7 - patch index: 5 - 6 7
- 10 11 - -This code results in the following output when the data layout is RowMajor: -(NOTE: the set of patches is the same as in ColMajor, but are indexed differently). - - patch index: 0 - 0 1 - 4 5 - patch index: 1 - 1 2 - 5 6 - patch index: 2 - 2 3 - 6 7 - patch index: 3 - 4 5 - 8 9 - patch index: 4 - 5 6 - 9 10 - patch index: 5 - 6 7 - 10 11 - -### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type) - -Returns a tensor of coefficient image patches extracted from the input tensor, -which is expected to have dimensions ordered as follows (depending on the data -layout of the input tensor, and the number of additional dimensions 'N'): - -*) ColMajor -1st dimension: channels (of size d) -2nd dimension: rows (of size r) -3rd dimension: columns (of size c) -4th-Nth dimension: time (for video) or batch (for bulk processing). - -*) RowMajor (reverse order of ColMajor) -1st-Nth dimension: time (for video) or batch (for bulk processing). -N+1'th dimension: columns (of size c) -N+2'th dimension: rows (of size r) -N+3'th dimension: channels (of size d) - -The returned tensor has one greater dimension than the input tensor, which is -used to index each patch. The patch index in the output tensor depends on the -data layout of the input tensor: the patch index is the 4'th dimension in -ColMajor layout, and the 4'th from the last dimension in RowMajor layout. 
- -For example, given the following input tensor with the following dimension -sizes: - *) depth: 2 - *) rows: 3 - *) columns: 5 - *) batch: 7 - - Tensor<float, 4> tensor(2,3,5,7); - Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout(); - -2x2 image patches can be extracted and indexed using the following code: - -*) 2D patch: ColMajor (patch indexed by second-to-last dimension) - - Tensor<float, 5> twod_patch; - twod_patch = tensor.extract_image_patches<2, 2>(); - // twod_patch.dimension(0) == 2 - // twod_patch.dimension(1) == 2 - // twod_patch.dimension(2) == 2 - // twod_patch.dimension(3) == 3*5 - // twod_patch.dimension(4) == 7 - -*) 2D patch: RowMajor (patch indexed by the second dimension) - - Tensor<float, 5, RowMajor> twod_patch_row_major; - twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); - // twod_patch_row_major.dimension(0) == 7 - // twod_patch_row_major.dimension(1) == 3*5 - // twod_patch_row_major.dimension(2) == 2 - // twod_patch_row_major.dimension(3) == 2 - // twod_patch_row_major.dimension(4) == 2 - -## Special Operations - -### <Operation> cast<T>() - -Returns a tensor of type T with the same dimensions as the original tensor. -The returned tensor contains the values of the original tensor converted to -type T. - - Eigen::Tensor<float, 2> a(2, 3); - Eigen::Tensor<int, 2> b = a.cast<int>(); - -This can be useful for example if you need to do element-wise division of -Tensors of integers. 
This is not currently supported by the Tensor library -but you can easily cast the tensors to floats to do the division: - - Eigen::Tensor<int, 2> a(2, 3); - a.setValues({{0, 1, 2}, {3, 4, 5}}); - Eigen::Tensor<int, 2> b = - (a.cast<float>() / a.constant(2).cast<float>()).cast<int>(); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 0 1 2 - 3 4 5 - - b - 0 0 1 - 1 2 2 - - -### <Operation> eval() - -TODO - - -## Representation of scalar values - -Scalar values are often represented by tensors of size 1 and rank 0. For example, -Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly, the inner -product of 2 1d tensors (through contractions) returns a 0d tensor. - -## Limitations - -* The number of tensor dimensions is currently limited to 250 when using a - compiler that supports cxx11. It is limited to only 5 for older compilers. -* The IndexList class requires a cxx11 compliant compiler. You can use an - array of indices instead if you don't have access to a modern compiler. -* On GPUs only floating point values are properly tested and optimized for. -* Complex and integer values are known to be broken on GPUs. If you try to use - them you'll most likely end up triggering a static assertion failure such as - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - - diff --git a/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h b/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h deleted file mode 100644 index 8cac2bb..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h +++ /dev/null @@ -1,554 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_H - -namespace Eigen { - -/** \class Tensor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor class. - * - * The %Tensor class is the work-horse for all \em dense tensors within Eigen. - * - * The %Tensor class encompasses only dynamic-size objects so far. - * - * The first two template parameters are required: - * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex<float>`. - * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). - * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) - * - * The remaining template parameters are optional -- in most cases you don't have to worry about them. - * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either - * \b #AutoAlign or \b #DontAlign. - * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required - * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. - * Support for such operations (i.e. adding two tensors etc.) is planned. - * - * You can access elements of tensors using normal subscripting: - * - * \code - * Eigen::Tensor<double, 4> t(10, 10, 10, 10); - * t(0, 1, 2, 3) = 42.0; - * \endcode - * - * This class can be extended with the help of the plugin mechanism described on the page - * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. 
- * - * <i><b>Some notes:</b></i> - * - * <dl> - * <dt><b>Relation to other parts of Eigen:</b></dt> - * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that - * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code - * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor - * class does not provide any of these features and is only available as a stand-alone class that just allows for - * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to - * change dramatically.</dd> - * </dl> - * - * \ref TopicStorageOrders - */ - -template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_> -class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > -{ - public: - typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self; - typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base; - typedef typename Eigen::internal::nested<Self>::type Nested; - typedef typename internal::traits<Self>::StorageKind StorageKind; - typedef typename internal::traits<Self>::Index Index; - typedef Scalar_ Scalar; - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - - enum { - IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign), - Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, - CoordAccess = true, - RawAccess = true - }; - - static const int Options = Options_; - static const int NumIndices = NumIndices_; - typedef DSizes<Index, NumIndices_> Dimensions; - - protected: - TensorStorage<Scalar, Dimensions, Options> m_storage; - -#ifdef EIGEN_HAS_SFINAE - template<typename CustomIndices> - struct isOfNormalIndex{ - static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value; - static const bool is_int = NumTraits<CustomIndices>::IsInteger; - static const bool value = is_array | is_int; - }; -#endif - - public: - // Metadata - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } - - // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - // work, because that uses base().coeffRef() - and we don't yet - // implement a similar class hierarchy - inline Self& base() { return *this; } - inline const Self& base() const { return *this; } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template<typename CustomIndices, - EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const - { - return coeff(internal::customIndices2Array<Index,NumIndices>(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template<typename CustomIndices, - EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) - { - return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const - { - return coeff(array<Index, 2>(i0, i1)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const - { - return coeff(array<Index, 3>(i0, i1, i2)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const - { - return coeff(array<Index, 4>(i0, i1, i2, i3)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - return coeff(array<Index, 5>(i0, i1, i2, i3, i4)); - } -#endif - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template<typename CustomIndices, - EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const - { - return coeff(internal::customIndices2Array<Index,NumIndices>(indices)); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const - { - return coeff(indices); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return coeff(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const - { - // The bracket operator is only for vectors, use the parenthesis operator instead. 
- EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(index); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) - { - return coeffRef(array<Index, 2>(i0, i1)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) - { - return coeffRef(array<Index, 3>(i0, i1, i2)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - return coeffRef(array<Index, 4>(i0, i1, i2, i3)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4)); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) - { - return coeffRef(indices); - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template<typename CustomIndices, - EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) - { - return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_assert(index >= 0 && index < size()); - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeffRef(); - 
} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) - { - // The bracket operator is only for vectors, use the parenthesis operator instead - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor() - : m_storage() - { - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const Self& other) - : m_storage(other.m_storage) - { - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) - : m_storage(firstDimension, otherDimensions...) - { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1) - : m_storage(dim1, array<Index, 1>(dim1)) - { - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2) - : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2)) - { - EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3) - : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3)) - { - EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4) - : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4)) - { - EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) - : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5)) - { - EIGEN_STATIC_ASSERT(5 == NumIndices, 
YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#endif - - /** Normal Dimension */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions) - : m_storage(internal::array_prod(dimensions), dimensions) - { - EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - } - - template<typename OtherDerived> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) - { - typedef TensorAssignOp<Tensor, const OtherDerived> Assign; - Assign assign(*this, other.derived()); - resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); - } - - template<typename OtherDerived> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other) - { - typedef TensorAssignOp<Tensor, const OtherDerived> Assign; - Assign assign(*this, other.derived()); - resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); - } - - #if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(Self&& other) - : m_storage(std::move(other.m_storage)) - { - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) - { - m_storage = std::move(other.m_storage); - return *this; - } - #endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) - { - typedef TensorAssignOp<Tensor, const Tensor> Assign; - Assign assign(*this, other); - resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); - return *this; - } - template<typename OtherDerived> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) - { - typedef TensorAssignOp<Tensor, const 
OtherDerived> Assign; - Assign assign(*this, other); - resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); - return *this; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> EIGEN_DEVICE_FUNC - void resize(Index firstDimension, IndexTypes... otherDimensions) - { - // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}}); - } -#endif - - /** Normal Dimension */ - EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions) - { - int i; - Index size = Index(1); - for (i = 0; i < NumIndices; i++) { - internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]); - size *= dimensions[i]; - } - #ifdef EIGEN_INITIALIZE_COEFFS - bool size_changed = size != this->size(); - m_storage.resize(size, dimensions); - if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - #else - m_storage.resize(size, dimensions); - #endif - } - - // Why this overload, DSizes is derived from array ??? // - EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) { - array<Index, NumIndices> dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = dimensions[i]; - } - resize(dims); - } - - EIGEN_DEVICE_FUNC - void resize() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - // Nothing to do: rank 0 tensors have fixed size - } - -#ifdef EIGEN_HAS_INDEX_LIST - template <typename FirstType, typename... 
OtherTypes> - EIGEN_DEVICE_FUNC - void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) { - array<Index, NumIndices> dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = static_cast<Index>(dimensions[i]); - } - resize(dims); - } -#endif - - /** Custom Dimension */ -#ifdef EIGEN_HAS_SFINAE - template<typename CustomDimension, - EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomDimension>::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) - { - resize(internal::customIndices2Array<Index,NumIndices>(dimensions)); - } -#endif - -#ifndef EIGEN_EMULATE_CXX11_META_H - template <typename std::ptrdiff_t... Indices> - EIGEN_DEVICE_FUNC - void resize(const Sizes<Indices...>& dimensions) { - array<Index, NumIndices> dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = static_cast<Index>(dimensions[i]); - } - resize(dims); - } -#else - template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> - EIGEN_DEVICE_FUNC - void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) { - array<Index, NumIndices> dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = static_cast<Index>(dimensions[i]); - } - resize(dims); - } -#endif - - protected: - - bool checkIndexRange(const array<Index, NumIndices>& indices) const - { - using internal::array_apply_and_reduce; - using internal::array_zip_and_reduce; - using internal::greater_equal_zero_op; - using internal::logical_and_op; - using internal::lesser_op; - - return - // check whether the indices are all >= 0 - array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && - // check whether the indices fit in the dimensions - array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const - { - if (Options&RowMajor) { - return m_storage.dimensions().IndexOfRowMajor(indices); - } else { - 
return m_storage.dimensions().IndexOfColMajor(indices); - } - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h deleted file mode 100644 index 8b8fb92..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h +++ /dev/null @@ -1,329 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com> -// Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H -#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H - -namespace Eigen { -namespace internal { - -/** \class TensorIndexTuple - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor + Index Tuple class. 
- * - * - */ -template<typename XprType> -struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType> -{ - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Tuple<Index, typename XprTraits::Scalar> Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template<typename XprType> -struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense> -{ - typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type; -}; - -template<typename XprType> -struct nested<TensorIndexTupleOp<XprType>, 1, - typename eval<TensorIndexTupleOp<XprType> >::type> -{ - typedef TensorIndexTupleOp<XprType> type; -}; - -} // end namespace internal - -template<typename XprType> -class TensorIndexTupleOp : public TensorBase<TensorIndexTupleOp<XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename Eigen::internal::nested<TensorIndexTupleOp>::type Nested; - typedef typename Eigen::internal::traits<TensorIndexTupleOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Index Index; - typedef Tuple<Index, typename XprType::CoeffReturnType> CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - -// Eval as rvalue -template<typename ArgType, typename Device> -struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> -{ - typedef TensorIndexTupleOp<ArgType> XprType; - typedef typename XprType::Index Index; 
- typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - - typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; - static const int NumDims = internal::array_size<Dimensions>::value; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, - PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return CoeffReturnType(index, m_impl.coeff(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: - TensorEvaluator<ArgType, Device> m_impl; -}; - -namespace 
internal { - -/** \class TensorTupleIndex - * \ingroup CXX11_Tensor_Module - * - * \brief Converts to Tensor<Tuple<Index, Scalar> > and reduces to Tensor<Index>. - * - */ -template<typename ReduceOp, typename Dims, typename XprType> -struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<XprType> -{ - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Index Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value; - static const int Layout = XprTraits::Layout; -}; - -template<typename ReduceOp, typename Dims, typename XprType> -struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense> -{ - typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>EIGEN_DEVICE_REF type; -}; - -template<typename ReduceOp, typename Dims, typename XprType> -struct nested<TensorTupleReducerOp<ReduceOp, Dims, XprType>, 1, - typename eval<TensorTupleReducerOp<ReduceOp, Dims, XprType> >::type> -{ - typedef TensorTupleReducerOp<ReduceOp, Dims, XprType> type; -}; - -} // end namespace internal - -template<typename ReduceOp, typename Dims, typename XprType> -class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename Eigen::internal::nested<TensorTupleReducerOp>::type Nested; - typedef typename Eigen::internal::traits<TensorTupleReducerOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Index Index; - typedef Index CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, - const ReduceOp& reduce_op, - const Index 
return_dim, - const Dims& reduce_dims) - : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - const ReduceOp& reduce_op() const { return m_reduce_op; } - - EIGEN_DEVICE_FUNC - const Dims& reduce_dims() const { return m_reduce_dims; } - - EIGEN_DEVICE_FUNC - Index return_dim() const { return m_return_dim; } - - protected: - typename XprType::Nested m_xpr; - const ReduceOp m_reduce_op; - const Index m_return_dim; - const Dims m_reduce_dims; -}; - -// Eval as rvalue -template<typename ReduceOp, typename Dims, typename ArgType, typename Device> -struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> -{ - typedef TensorTupleReducerOp<ReduceOp, Dims, ArgType> XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename TensorIndexTupleOp<ArgType>::CoeffReturnType TupleType; - typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Dimensions Dimensions; - typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions; - static const int NumDims = internal::array_size<InputDimensions>::value; - typedef array<Index, NumDims> StrideDims; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef StorageMemory<TupleType, Device> TupleStorageMem; - - enum { - IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, - PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const 
TensorIndexTupleOp<ArgType> >, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_orig_impl(op.expression(), device), - m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) - { - gen_strides(m_orig_impl.dimensions(), m_strides); - if (Layout == static_cast<int>(ColMajor)) { - const Index total_size = internal::array_prod(m_orig_impl.dimensions()); - m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size; - } else { - const Index total_size = internal::array_prod(m_orig_impl.dimensions()); - m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; - } - // If m_return_dim is not a valid index, returns 1 or this can crash on Windows. - m_stride_div = ((m_return_dim >= 0) && - (m_return_dim < static_cast<Index>(m_strides.size()))) - ? m_strides[m_return_dim] : 1; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - const TupleType v = m_impl.coeff(index); - return (m_return_dim < 0) ? 
v.first : (v.first % m_stride_mod) / m_stride_div; - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } -#ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - m_orig_impl.bind(cgh); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double compute_cost = 1.0 + - (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>())); - return m_orig_impl.costPerCoeff(vectorized) + - m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); - } - - private: - EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { - if (m_return_dim < 0) { - return; // Won't be using the strides. - } - eigen_assert(m_return_dim < NumDims && - "Asking to convert index to a dimension outside of the rank"); - - // Calculate m_stride_div and m_stride_mod, which are used to - // calculate the value of an index w.r.t. the m_return_dim. 
- if (Layout == static_cast<int>(ColMajor)) { - strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - strides[i] = strides[i-1] * dims[i-1]; - } - } else { - strides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - strides[i] = strides[i+1] * dims[i+1]; - } - } - } - - protected: - TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl; - TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl; - const Index m_return_dim; - StrideDims m_strides; - Index m_stride_mod; - Index m_stride_div; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h deleted file mode 100644 index e5811d6..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h +++ /dev/null @@ -1,247 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H -#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H - -namespace Eigen { - -/** \class TensorAssign - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor assignment class. - * - * This class is represents the assignment of the values resulting from the evaluation of - * the rhs expression to the memory locations denoted by the lhs expression. 
- */ -namespace internal { -template<typename LhsXprType, typename RhsXprType> -struct traits<TensorAssignOp<LhsXprType, RhsXprType> > -{ - typedef typename LhsXprType::Scalar Scalar; - typedef typename traits<LhsXprType>::StorageKind StorageKind; - typedef typename promote_index_type<typename traits<LhsXprType>::Index, - typename traits<RhsXprType>::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference<LhsNested>::type _LhsNested; - typedef typename remove_reference<RhsNested>::type _RhsNested; - static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions; - static const int Layout = internal::traits<LhsXprType>::Layout; - typedef typename traits<LhsXprType>::PointerType PointerType; - - enum { - Flags = 0 - }; -}; - -template<typename LhsXprType, typename RhsXprType> -struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense> -{ - typedef const TensorAssignOp<LhsXprType, RhsXprType>& type; -}; - -template<typename LhsXprType, typename RhsXprType> -struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type> -{ - typedef TensorAssignOp<LhsXprType, RhsXprType> type; -}; - -} // end namespace internal - - - -template<typename LhsXprType, typename RhsXprType> -class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> > -{ - public: - typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename LhsXprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested; - typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index; - - static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions; - - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - typename internal::remove_all<typename LhsXprType::Nested>::type& - lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename RhsXprType::Nested>::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr; - const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr; -}; - - -template<typename LeftArgType, typename RightArgType, typename Device> -struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> -{ - typedef TensorAssignOp<LeftArgType, RightArgType> XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - static const int NumDims = XprType::NumDims; - - enum { - IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) & - int(TensorEvaluator<RightArgType, Device>::IsAligned), - PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) & - int(TensorEvaluator<RightArgType, Device>::PacketAccess), - BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) & - int(TensorEvaluator<RightArgType, Device>::BlockAccess), - PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) | - int(TensorEvaluator<RightArgType, 
Device>::PreferBlockAccess), - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock - RightTensorBlock; - //===--------------------------------------------------------------------===// - - TensorEvaluator(const XprType& op, const Device& device) : - m_leftImpl(op.lhsExpression(), device), - m_rightImpl(op.rhsExpression(), device) - { - EIGEN_STATIC_ASSERT( - (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == - static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - } - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // The dimensions of the lhs and the rhs tensors should be equal to prevent - // overflows and ensure the result is fully initialized. - // TODO: use left impl instead if right impl dimensions are known at compile time. - return m_rightImpl.dimensions(); - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); - m_leftImpl.evalSubExprsIfNeeded(NULL); - // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non - // null value), attempt to evaluate the rhs expression in place. Returns true iff in place - // evaluation isn't supported and the caller still needs to manually assign the values generated - // by the rhs to the lhs. 
- return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { - m_rightImpl.evalSubExprsIfNeededAsync( - m_leftImpl.data(), [done](bool need_assign) { done(need_assign); }); - }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { - m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - - const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned; - const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned; - m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i)); - } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_leftImpl.coeff(index); - } - template<int LoadMode> - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - return m_leftImpl.template packet<LoadMode>(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // We assume that evalPacket or evalScalar is called to perform the - // assignment and account for the cost of the write here, but reduce left - // cost by one load because we are using m_leftImpl.coeffRef. 
- TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); - return m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost( - numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), - left.bytes_stored(), left.compute_cycles()) + - TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::merge( - m_leftImpl.getResourceRequirements(), - m_rightImpl.getResourceRequirements()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( - TensorBlockDesc& desc, TensorBlockScratch& scratch) { - if (TensorEvaluator<LeftArgType, Device>::RawAccess && - m_leftImpl.data() != NULL) { - // If destination has raw data access, we pass it as a potential - // destination for a block descriptor evaluation. - desc.template AddDestinationBuffer<Layout>( - /*dst_base=*/m_leftImpl.data() + desc.offset(), - /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions())); - } - - RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true); - // If block was evaluated into a destination, there is no need to do assignment. 
- if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { - m_leftImpl.writeBlock(desc, block); - } - block.cleanup(); - } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_leftImpl.bind(cgh); - m_rightImpl.bind(cgh); - } -#endif - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } - - private: - TensorEvaluator<LeftArgType, Device> m_leftImpl; - TensorEvaluator<RightArgType, Device> m_rightImpl; -}; - -} - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h deleted file mode 100644 index 35b6458..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h +++ /dev/null @@ -1,1176 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H -#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H - -// clang-format off - -namespace Eigen { - -/** \class TensorBase - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor base class. - * - * This class is the common parent of the Tensor and TensorMap class, thus - * making it possible to use either class interchangeably in expressions. 
- */ -#ifndef EIGEN_PARSED_BY_DOXYGEN -// FIXME Doxygen does not like the inheritance with different template parameters -// Since there is no doxygen documentation inside, we disable it for now -template<typename Derived> -class TensorBase<Derived, ReadOnlyAccessors> -{ - public: - typedef internal::traits<Derived> DerivedTraits; - typedef typename DerivedTraits::Scalar Scalar; - typedef typename DerivedTraits::Index Index; - typedef typename internal::remove_const<Scalar>::type CoeffReturnType; - static const int NumDimensions = DerivedTraits::NumDimensions; - - // Generic nullary operation support. - template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived> - nullaryExpr(const CustomNullaryOp& func) const { - return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func); - } - - // Coefficient-wise nullary operators - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> - constant(const Scalar& value) const { - return nullaryExpr(internal::scalar_constant_op<Scalar>(value)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived> - random() const { - return nullaryExpr(internal::UniformRandomGenerator<Scalar>()); - } - template <typename RandomGenerator> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived> - random(const RandomGenerator& gen = RandomGenerator()) const { - return nullaryExpr(gen); - } - - // Tensor generation - template <typename Generator> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorGeneratorOp<Generator, const Derived> - generate(const Generator& generator) const { - return TensorGeneratorOp<Generator, const Derived>(derived(), generator); - } - - // Generic unary operation support. 
- template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived> - unaryExpr(const CustomUnaryOp& func) const { - return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func); - } - - // Coefficient-wise unary operators - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> - operator-() const { - return unaryExpr(internal::scalar_opposite_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> - sqrt() const { - return unaryExpr(internal::scalar_sqrt_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> - sign() const { - return unaryExpr(internal::scalar_sign_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> - rsqrt() const { - return unaryExpr(internal::scalar_rsqrt_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> - square() const { - return unaryExpr(internal::scalar_square_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> - cube() const { - return unaryExpr(internal::scalar_cube_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> - inverse() const { - return unaryExpr(internal::scalar_inverse_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> - tanh() const { - return unaryExpr(internal::scalar_tanh_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const 
Derived> - lgamma() const { - return unaryExpr(internal::scalar_lgamma_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> - digamma() const { - return unaryExpr(internal::scalar_digamma_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0_op<Scalar>, const Derived> - bessel_i0() const { - return unaryExpr(internal::scalar_bessel_i0_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0e_op<Scalar>, const Derived> - bessel_i0e() const { - return unaryExpr(internal::scalar_bessel_i0e_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1_op<Scalar>, const Derived> - bessel_i1() const { - return unaryExpr(internal::scalar_bessel_i1_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1e_op<Scalar>, const Derived> - bessel_i1e() const { - return unaryExpr(internal::scalar_bessel_i1e_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j0_op<Scalar>, const Derived> - bessel_j0() const { - return unaryExpr(internal::scalar_bessel_j0_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y0_op<Scalar>, const Derived> - bessel_y0() const { - return unaryExpr(internal::scalar_bessel_y0_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j1_op<Scalar>, const Derived> - bessel_j1() const { - return unaryExpr(internal::scalar_bessel_j1_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y1_op<Scalar>, const Derived> - bessel_y1() const { - return unaryExpr(internal::scalar_bessel_y1_op<Scalar>()); - } - - 
EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0_op<Scalar>, const Derived> - bessel_k0() const { - return unaryExpr(internal::scalar_bessel_k0_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0e_op<Scalar>, const Derived> - bessel_k0e() const { - return unaryExpr(internal::scalar_bessel_k0e_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1_op<Scalar>, const Derived> - bessel_k1() const { - return unaryExpr(internal::scalar_bessel_k1_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1e_op<Scalar>, const Derived> - bessel_k1e() const { - return unaryExpr(internal::scalar_bessel_k1e_op<Scalar>()); - } - - // igamma(a = this, x = other) - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived> - igamma(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>()); - } - - // igamma_der_a(a = this, x = other) - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived> - igamma_der_a(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>()); - } - - // gamma_sample_der_alpha(alpha = this, sample = other) - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived> - gamma_sample_der_alpha(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>()); - } - - // igammac(a = this, x = other) - template<typename OtherDerived> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived> - igammac(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igammac_op<Scalar>()); - } - - // zeta(x = this, q = other) - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const OtherDerived> - zeta(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_zeta_op<Scalar>()); - } - - // polygamma(n = this, x = other) - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const Derived, const OtherDerived> - polygamma(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_polygamma_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> - erf() const { - return unaryExpr(internal::scalar_erf_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> - erfc() const { - return unaryExpr(internal::scalar_erfc_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived> - ndtri() const { - return unaryExpr(internal::scalar_ndtri_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> - sigmoid() const { - return unaryExpr(internal::scalar_logistic_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> - exp() const { - return unaryExpr(internal::scalar_exp_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const 
TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived> - expm1() const { - return unaryExpr(internal::scalar_expm1_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> - log() const { - return unaryExpr(internal::scalar_log_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> - log1p() const { - return unaryExpr(internal::scalar_log1p_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived> - log2() const { - return unaryExpr(internal::scalar_log2_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> - abs() const { - return unaryExpr(internal::scalar_abs_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived> - clip(Scalar min, Scalar max) const { - return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex, - TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>, - Derived>::type - conjugate() const { - return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived> - pow(Scalar exponent) const { - return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived> - real() const { - return unaryExpr(internal::scalar_real_op<Scalar>()); - } - - 
EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> - imag() const { - return unaryExpr(internal::scalar_imag_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived> - operator+ (Scalar rhs) const { - return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived> - operator+ (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived> - operator- (Scalar rhs) const { - EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived> - operator- (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived> - operator* (Scalar rhs) const { - return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived> - operator* (Scalar lhs, const Derived& rhs) { - return 
rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived> - operator/ (Scalar rhs) const { - return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived> - operator/ (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_mod_op<Scalar>, const Derived> - operator% (Scalar rhs) const { - EIGEN_STATIC_ASSERT(NumTraits<Scalar>::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); - return unaryExpr(internal::scalar_mod_op<Scalar>(rhs)); - } - - template <int NanPropagation=PropagateFast> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > - cwiseMax(Scalar threshold) const { - return cwiseMax<NanPropagation>(constant(threshold)); - } - - template <int NanPropagation=PropagateFast> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > - cwiseMin(Scalar threshold) const { - return cwiseMin<NanPropagation>(constant(threshold)); - } - - template<typename NewType> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value, - Derived, - TensorConversionOp<NewType, const Derived> >::type - cast() const { - return choose(Cond<internal::is_same<NewType, 
CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived())); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> - round() const { - return unaryExpr(internal::scalar_round_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived> - rint() const { - return unaryExpr(internal::scalar_rint_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived> - ceil() const { - return unaryExpr(internal::scalar_ceil_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived> - floor() const { - return unaryExpr(internal::scalar_floor_op<Scalar>()); - } - - // Generic binary operation support. - template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> - binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { - return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func); - } - - // Coefficient-wise binary operators. 
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived> - operator+(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>()); - } - - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived> - operator-(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>()); - } - - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived> - operator*(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>()); - } - - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived> - operator/(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>()); - } - - template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived> - cwiseMax(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>()); - } - - template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived> - cwiseMin(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_min_op<Scalar,Scalar, NaNPropagation>()); - } - - template<typename OtherDerived> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived> - operator&&(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); - } - - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived> - operator||(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); - } - - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived> - operator^(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_boolean_xor_op()); - } - - // Comparisons and tests. - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived> - operator<(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>()); - } - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived> - operator<=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>()); - } - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived> - operator>(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>()); - } - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const 
TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived> - operator>=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>()); - } - - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived> - operator==(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>()); - } - - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived> - operator!=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>()); - } - - // comparisons and tests for Scalars - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > - operator<(Scalar threshold) const { - return operator<(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > - operator<=(Scalar threshold) const { - return operator<=(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > - operator>(Scalar threshold) const { - return operator>(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const 
TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > - operator>=(Scalar threshold) const { - return operator>=(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > - operator==(Scalar threshold) const { - return operator==(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > - operator!=(Scalar threshold) const { - return operator!=(constant(threshold)); - } - - // Checks - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived> - (isnan)() const { - return unaryExpr(internal::scalar_isnan_op<Scalar>()); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived> - (isinf)() const { - return unaryExpr(internal::scalar_isinf_op<Scalar>()); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived> - (isfinite)() const { - return unaryExpr(internal::scalar_isfinite_op<Scalar>()); - } - - // Coefficient-wise ternary operators. - template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived> - select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { - return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived()); - } - - // Contractions. 
- typedef Eigen::IndexPair<Index> DimensionPair; - - template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel> - contract(const OtherDerived& other, const Dimensions& dims) const { - return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims); - } - - template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel> - contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const { - return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel); - } - - // Convolutions. - template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived> - convolve(const KernelDerived& kernel, const Dimensions& dims) const { - return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims); - } - - // Fourier transforms - template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection> - fft(const FFT& dims) const { - return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), dims); - } - - // Scan. 
- typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanSumOp - cumsum(const Index& axis, bool exclusive = false) const { - return TensorScanSumOp(derived(), axis, exclusive); - } - - typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanProdOp - cumprod(const Index& axis, bool exclusive = false) const { - return TensorScanProdOp(derived(), axis, exclusive); - } - - template <typename Reducer> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanOp<Reducer, const Derived> - scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { - return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer); - } - - // Reductions. - template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived> - sum(const Dims& dims) const { - return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>()); - } - - const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> - sum() const { - DimensionList<Index, NumDimensions> in_dims; - return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>()); - } - - template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived> - mean(const Dims& dims) const { - return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>()); - } - - const 
TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> - mean() const { - DimensionList<Index, NumDimensions> in_dims; - return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>()); - } - - template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived> - prod(const Dims& dims) const { - return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>()); - } - - const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived> - prod() const { - DimensionList<Index, NumDimensions> in_dims; - return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>()); - } - - template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived> - maximum(const Dims& dims) const { - return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType,NanPropagation>()); - } - - template <int NanPropagation=PropagateFast> - const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived> - maximum() const { - DimensionList<Index, NumDimensions> in_dims; - return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, 
internal::MaxReducer<CoeffReturnType,NanPropagation>()); - } - - template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived> - minimum(const Dims& dims) const { - return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType,NanPropagation>()); - } - - template <int NanPropagation=PropagateFast> - const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived> - minimum() const { - DimensionList<Index, NumDimensions> in_dims; - return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType,NanPropagation>()); - } - - template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type > - all(const Dims& dims) const { - return cast<bool>().reduce(dims, internal::AndReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type > - all() const { - DimensionList<Index, NumDimensions> in_dims; - return cast<bool>().reduce(in_dims, internal::AndReducer()); - } - - template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> 
>::type > - any(const Dims& dims) const { - return cast<bool>().reduce(dims, internal::OrReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type > - any() const { - DimensionList<Index, NumDimensions> in_dims; - return cast<bool>().reduce(in_dims, internal::OrReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, - const array<Index, NumDimensions>, const Derived> - argmax() const { - array<Index, NumDimensions> in_dims; - for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; - return TensorTupleReducerOp< - internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, - const array<Index, NumDimensions>, - const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, - const array<Index, NumDimensions>, const Derived> - argmin() const { - array<Index, NumDimensions> in_dims; - for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; - return TensorTupleReducerOp< - internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, - const array<Index, NumDimensions>, - const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, - const array<Index, 1>, const Derived> - argmax(const Index return_dim) const { - array<Index, 1> in_dims; - in_dims[0] = return_dim; - return TensorTupleReducerOp< - internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, - const array<Index, 1>, - 
const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, - const array<Index, 1>, const Derived> - argmin(const Index return_dim) const { - array<Index, 1> in_dims; - in_dims[0] = return_dim; - return TensorTupleReducerOp< - internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, - const array<Index, 1>, - const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims); - } - - template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp<Reducer, const Dims, const Derived> - reduce(const Dims& dims, const Reducer& reducer) const { - return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer); - } - - template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTraceOp<const Dims, const Derived> - trace(const Dims& dims) const { - return TensorTraceOp<const Dims, const Derived>(derived(), dims); - } - - const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived> - trace() const { - DimensionList<Index, NumDimensions> in_dims; - return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims); - } - - template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorBroadcastingOp<const Broadcast, const Derived> - broadcast(const Broadcast& bcast) const { - return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), bcast); - } - - template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConcatenationOp<Axis, const Derived, const OtherDerived> - concatenate(const OtherDerived& other, Axis axis) const { - return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis); - } - - 
template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPatchOp<const PatchDims, const Derived> - extract_patches(const PatchDims& patch_dims) const { - return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorImagePatchOp<Dynamic, Dynamic, const Derived> - extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1, - const Index row_stride = 1, const Index col_stride = 1, - const Index in_row_stride = 1, const Index in_col_stride = 1, - const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { - return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, - in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorImagePatchOp<Dynamic, Dynamic, const Derived> - extract_image_patches(const Index patch_rows, const Index patch_cols, - const Index row_stride, const Index col_stride, - const Index in_row_stride, const Index in_col_stride, - const Index row_inflate_stride, const Index col_inflate_stride, - const Index padding_top, const Index padding_bottom, - const Index padding_left,const Index padding_right, - const Scalar padding_value) const { - return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, - in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, - padding_top, padding_bottom, padding_left, padding_right, padding_value); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived> - extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, - const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, - const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 
Scalar(0)) const { - return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); - } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived> - extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, - const Index plane_stride, const Index row_stride, const Index col_stride, - const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, - const Index padding_top_z, const Index padding_bottom_z, - const Index padding_top, const Index padding_bottom, - const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { - return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); - } - - // Morphing operators. 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorLayoutSwapOp<const Derived> - swap_layout() const { - return TensorLayoutSwapOp<const Derived>(derived()); - } - template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReshapingOp<const NewDimensions, const Derived> - reshape(const NewDimensions& newDimensions) const { - return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); - } - template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSlicingOp<const StartIndices, const Sizes, const Derived> - slice(const StartIndices& startIndices, const Sizes& sizes) const { - return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); - } - template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived> - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { - return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, - const Derived>(derived(), startIndices, stopIndices, strides); - } - template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp<DimId, const Derived> - chip(const Index offset) const { - return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp<Dynamic, const Derived> - chip(const Index offset, const Index dim) const { - return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); - } - template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReverseOp<const ReverseDimensions, const Derived> - reverse(const ReverseDimensions& rev) const { - return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); - } - 
template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPaddingOp<const PaddingDimensions, const Derived> - pad(const PaddingDimensions& padding) const { - return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, internal::scalar_cast_op<int, Scalar>()(0)); - } - template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPaddingOp<const PaddingDimensions, const Derived> - pad(const PaddingDimensions& padding, const Scalar padding_value) const { - return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, padding_value); - } - template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorShufflingOp<const Shuffle, const Derived> - shuffle(const Shuffle& shfl) const { - return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl); - } - template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingOp<const Strides, const Derived> - stride(const Strides& strides) const { - return TensorStridingOp<const Strides, const Derived>(derived(), strides); - } - template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorInflationOp<const Strides, const Derived> - inflate(const Strides& strides) const { - return TensorInflationOp<const Strides, const Derived>(derived(), strides); - } - - // Returns a tensor containing index/value tuples - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorIndexTupleOp<const Derived> - index_tuples() const { - return TensorIndexTupleOp<const Derived>(derived()); - } - - // Support for custom unary and binary operations - template <typename CustomUnaryFunc> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const { - return TensorCustomUnaryOp<const CustomUnaryFunc, const Derived>(derived(), op); - } - template <typename OtherDerived, typename CustomBinaryFunc> - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived> customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { - return TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived>(derived(), other, op); - } - - // Force the evaluation of the expression. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorForcedEvalOp<const Derived> eval() const { - return TensorForcedEvalOp<const Derived>(derived()); - } - - protected: - template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor; - template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize; - // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 - template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } -}; - -template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> -class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> { - public: - typedef TensorBase<Derived, ReadOnlyAccessors> Base; - typedef internal::traits<Derived> DerivedTraits; - typedef typename DerivedTraits::Scalar Scalar; - typedef typename DerivedTraits::Index Index; - typedef Scalar CoeffReturnType; - static const int NumDimensions = DerivedTraits::NumDimensions; - - template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor; - template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize; - // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 - template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setZero() { - return setConstant(Scalar(0)); - } - 
EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { - return derived() = this->constant(val); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = this->random(); - } - template <typename RandomGenerator> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = this->template random<RandomGenerator>(); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setValues( - const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) { - TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice()); - internal::initialize_tensor<Derived, NumDimensions>(eval, vals); - return derived(); - } -#endif // EIGEN_HAS_VARIADIC_TEMPLATES - - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator+=(const OtherDerived& other) { - return derived() = derived() + other.derived(); - } - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator-=(const OtherDerived& other) { - return derived() = derived() - other.derived(); - } - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator*=(const OtherDerived& other) { - return derived() = derived() * other.derived(); - } - template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator/=(const OtherDerived& other) { - return derived() = derived() / other.derived(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorLayoutSwapOp<const Derived> - swap_layout() const { - return TensorLayoutSwapOp<const Derived>(derived()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorLayoutSwapOp<Derived> - swap_layout() { - return TensorLayoutSwapOp<Derived>(derived()); - } - - template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConcatenationOp<const Axis, const Derived, const OtherDerived> - concatenate(const 
OtherDerived& other, const Axis& axis) const { - return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis); - } - template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorConcatenationOp<const Axis, Derived, OtherDerived> - concatenate(const OtherDerived& other, const Axis& axis) { - return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis); - } - - template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReshapingOp<const NewDimensions, const Derived> - reshape(const NewDimensions& newDimensions) const { - return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); - } - template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReshapingOp<const NewDimensions, Derived> - reshape(const NewDimensions& newDimensions) { - return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions); - } - - template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSlicingOp<const StartIndices, const Sizes, const Derived> - slice(const StartIndices& startIndices, const Sizes& sizes) const { - return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); - } - template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorSlicingOp<const StartIndices, const Sizes, Derived> - slice(const StartIndices& startIndices, const Sizes& sizes) { - return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes); - } - - template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived> - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { - 
return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, - const Derived>(derived(), startIndices, stopIndices, strides); - } - template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived> - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { - return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, - Derived>(derived(), startIndices, stopIndices, strides); - } - - template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp<DimId, const Derived> - chip(const Index offset) const { - return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); - } - template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp<DimId, Derived> - chip(const Index offset) { - return TensorChippingOp<DimId, Derived>(derived(), offset, DimId); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp<Dynamic, const Derived> - chip(const Index offset, const Index dim) const { - return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp<Dynamic, Derived> - chip(const Index offset, const Index dim) { - return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim); - } - - template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReverseOp<const ReverseDimensions, const Derived> - reverse(const ReverseDimensions& rev) const { - return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); - } - template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReverseOp<const ReverseDimensions, Derived> - reverse(const ReverseDimensions& rev) { - return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev); - } - - 
template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorShufflingOp<const Shuffle, const Derived> - shuffle(const Shuffle& shfl) const { - return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl); - } - template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorShufflingOp<const Shuffle, Derived> - shuffle(const Shuffle& shfl) { - return TensorShufflingOp<const Shuffle, Derived>(derived(), shfl); - } - - template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingOp<const Strides, const Derived> - stride(const Strides& strides) const { - return TensorStridingOp<const Strides, const Derived>(derived(), strides); - } - template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingOp<const Strides, Derived> - stride(const Strides& strides) { - return TensorStridingOp<const Strides, Derived>(derived(), strides); - } - - // Select the device on which to evaluate the expression. - template <typename DeviceType> - TensorDevice<Derived, DeviceType> device(const DeviceType& dev) { - return TensorDevice<Derived, DeviceType>(dev, derived()); - } - - // Select the async device on which to evaluate the expression. 
- template <typename DeviceType, typename DoneCallback> - TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) { - return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done)); - } - - protected: - EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase) - EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase) - - template<typename OtherDerived> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) - { - typedef TensorAssignOp<Derived, const OtherDerived> Assign; - Assign assign(derived(), other.derived()); - internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); - return derived(); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } -}; -#endif // EIGEN_PARSED_BY_DOXYGEN -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h deleted file mode 100644 index 1e55d12..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h +++ /dev/null @@ -1,1559 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H -#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H - -namespace Eigen { -namespace internal { - -// -------------------------------------------------------------------------- // -// Forward declarations for templates defined below. 
-template <typename Scalar, typename IndexType, int NumDims, int Layout> -class TensorBlockIO; - -// -------------------------------------------------------------------------- // -// Helper function to compute strides for densely stored buffer of given -// dimensions. - -// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use -// this function instead everywhere. -template <int Layout, typename IndexType, int NumDims> -EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides( - const DSizes<IndexType, NumDims>& dimensions) { - DSizes<IndexType, NumDims> strides; - if (NumDims == 0) return strides; - - // TODO(ezhulenev): Use templates to unroll this loop (similar to - // h_array_reduce in CXX11meta.h)? Benchmark it. - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - strides[i] = strides[i - 1] * dimensions[i - 1]; - } - } else { - strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dimensions[i + 1]; - } - } - - return strides; -} - -template <int Layout, typename IndexType, size_t NumDims> -EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides( - const Eigen::array<IndexType, NumDims>& dimensions) { - return strides<Layout>(DSizes<IndexType, NumDims>(dimensions)); -} - -template <int Layout, std::ptrdiff_t... Indices> -EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides( - const Sizes<Indices...>& sizes) { - return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes)); -} - -// -------------------------------------------------------------------------- // - -// Tensor block shape type defines what are the shape preference for the blocks -// extracted from the larger tensor. 
-// -// Example: blocks of 100 elements from the large 100x100 tensor: -// - tensor: 100x100 -// - target_block_size: 100 -// -// TensorBlockShapeType: -// - kUniformAllDims: 100 blocks of size 10x10 -// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column -// or row major layout) -enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; - -struct TensorBlockResourceRequirements { - TensorBlockShapeType shape_type; // target block shape - size_t size; // target block size - TensorOpCost cost_per_coeff; // cost of computing a single block element - -#ifdef EIGEN_HIPCC - // For HIPCC, we need to explicitly declare as a "device fun", the constructor - // which is implicitly invoked in the "merge" / "any" routines. else HIPCC - // errors out complaining about the lack of a matching constructor - EIGEN_DEVICE_FUNC - TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, - TensorOpCost cost_) - : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) - {} -#endif - - template <typename Scalar> - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( - TensorBlockShapeType shape_type, size_t size_in_bytes, - TensorOpCost cost) { - const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); - return {shape_type, size, cost}; - } - - template <typename Scalar> - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( - TensorBlockShapeType shape_type, size_t size_in_bytes) { - // This default cost per coefficient is valid for most materialized tensor - // block evaluation implementations, because they typically just read - // coefficients from the underlying tensor storage, and write to the tensor - // block buffer (scratch or destination memory, reads and writes have linear - // access pattern). We ignore the fixed cost of block evaluation, because in - // practice it should negligible. 
- // - // Lazy block evaluation adds the cost of calling a functor for each - // coefficient. - // - // All non-trivial block evaluation implementations must provide their own - // cost approximation (e.g. shuffling inner dimension has a much higher cost - // because it reads memory randomly, although the total number of moved - // bytes is the same). - return withShapeAndSize<Scalar>(shape_type, size_in_bytes, - {/*bytes_loaded=*/sizeof(Scalar), - /*bytes_stored=*/sizeof(Scalar), - /*compute_cycles=*/0}); - } - - template <typename Scalar> - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed( - size_t size_in_bytes) { - return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, - size_in_bytes); - } - - template <typename Scalar> - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform( - size_t size_in_bytes) { - return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, - size_in_bytes); - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockResourceRequirements - merge(const TensorBlockResourceRequirements& lhs, - const TensorBlockResourceRequirements& rhs) { - return {merge(lhs.shape_type, rhs.shape_type), // shape_type - merge(lhs.size, rhs.size), // size - merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff - } - - EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff( - TensorOpCost cost) { - cost_per_coeff += cost; - return *this; - } - - // This is a resource requirement that should be returned from expressions - // that do not have any block evaluation preference (e.g. default tensor - // expression with raw buffer access). 
- EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { - return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; - } - - private: - using Requirements = TensorBlockResourceRequirements; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { - return numext::maxi(lhs_size, rhs_size); - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockShapeType - merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) { - return (lhs == TensorBlockShapeType::kSkewedInnerDims || - rhs == TensorBlockShapeType::kSkewedInnerDims) - ? TensorBlockShapeType::kSkewedInnerDims - : TensorBlockShapeType::kUniformAllDims; - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, - TensorOpCost rhs_cost) { - return lhs_cost + rhs_cost; - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockDescriptor specifies a block offset within a tensor and the block -// sizes along each of the tensor dimensions. - -template <int NumDims, typename IndexType = Eigen::Index> -class TensorBlockDescriptor { - public: - typedef DSizes<IndexType, NumDims> Dimensions; - - // If we evaluate a Tensor assignment, and expression on the left, already has - // a memory buffer, then we might do performance optimization, and evaluate - // the root expression directly into the final output memory. Some time it's - // possible to reuse it for materializing subexpressions inside an expression - // tree, to to avoid dynamic memory allocation. - // - // The pointer type of the underlying storage is erased, because passing - // Scalar type through all the expression evaluation layers is way too many - // templates. In practice destination buffer type should always match the - // evaluated expression scalar type. 
- class DestinationBuffer { - public: - enum DestinationBufferKind : int { - // The above explicit specification of "int" as the enum basetype is - // needed to get around a HIPCC link error ("the field type is not - // amp-compatible") - // which is issued for class members with the enum type. - // TODO(rocm): - // remove the "int" basetype once HIPCC has been fixed to not error out - // in the above scenario. - - // Destination buffer is not defined (`m_data` == nullptr). - kEmpty, - - // Tensor block defined by an owning tensor block descriptor can fit - // contiguously into the destination buffer. In this case it's safe to - // materialize tensor block in the destination buffer, wrap it in a - // TensorMap, and use to build Eigen expression on top of it. - kContiguous, - - // Destination buffer strides do not match strides of the contiguously - // stored block, and it's impossible to define a TensorMap over this - // buffer. However if we are evaluating a root of an expression tree, we - // still can materialize an output into this destination, because we can - // guarantee that no one will ever access it through block API. - // - // In theory it is possible to build valid TensorStriding<TensorMap> - // expression on top of this destination buffer, however it has - // inefficient coeff/packet access, and defeats the purpose of fast block - // evaluation API. 
- kStrided - }; - - template <typename Scalar> - Scalar* data() const { - eigen_assert(m_data_type_size == sizeof(Scalar)); - return static_cast<Scalar*>(m_data); - } - - const Dimensions& strides() const { return m_strides; } - const DestinationBufferKind& kind() const { return m_kind; } - - private: - friend class TensorBlockDescriptor; - - DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} - - template <typename Scalar> - DestinationBuffer(Scalar* data, const Dimensions& strides, - DestinationBufferKind kind) - : m_data(static_cast<void*>(data)), - m_data_type_size(sizeof(Scalar)), - m_strides(strides), - m_kind(kind) {} - - template <int Layout, typename Scalar> - static DestinationBuffer make(const TensorBlockDescriptor& desc, - Scalar* data, const Dimensions& strides) { - return DestinationBuffer(data, strides, kind<Layout>(desc, strides)); - } - - template <int Layout> - static DestinationBufferKind kind(const TensorBlockDescriptor& desc, - const Dimensions& strides) { - const Dimensions& desc_dims = desc.dimensions(); - const Dimensions& desc_strides = internal::strides<Layout>(desc_dims); - for (int i = 0; i < NumDims; ++i) { - if (desc_dims[i] == 1) continue; - if (desc_strides[i] != strides[i]) return kStrided; - } - return kContiguous; - } - - // Storage pointer is type erased, to reduce template bloat, but we still - // keep the size of the underlying element type for error checking. - void* m_data; - size_t m_data_type_size; - - // Destination buffer dimensions always match the dimensions of a tensor - // block descriptor it belongs to, however strides might be different. 
- Dimensions m_strides; - - DestinationBufferKind m_kind; - }; - - TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, - const DestinationBuffer& destination) - : m_offset(offset), - m_dimensions(dimensions), - m_destination(destination) {} - - TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) - : m_offset(offset), - m_dimensions(dimensions), - m_destination(DestinationBuffer()) {} - - IndexType offset() const { return m_offset; } - const Dimensions& dimensions() const { return m_dimensions; } - IndexType dimension(int index) const { return m_dimensions[index]; } - IndexType size() const { return array_prod<IndexType>(m_dimensions); } - - const DestinationBuffer& destination() const { return m_destination; } - - template <int Layout, typename Scalar> - void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { - eigen_assert(dst_base != NULL); - m_destination = - DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides); - } - - template <int Layout, typename Scalar, typename DstStridesIndexType> - void AddDestinationBuffer( - Scalar* dst_base, - const DSizes<DstStridesIndexType, NumDims>& dst_strides) { - // DSizes constructor will do index type promotion if it's safe. - AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides)); - } - - TensorBlockDescriptor& DropDestinationBuffer() { - m_destination.m_data = NULL; - m_destination.m_kind = DestinationBuffer::kEmpty; - return *this; - } - - bool HasDestinationBuffer() const { - return m_destination.kind() != DestinationBuffer::kEmpty; - } - - // Returns a copy of `*this` with updated offset. - TensorBlockDescriptor WithOffset(IndexType offset) const { - return TensorBlockDescriptor(offset, m_dimensions, m_destination); - } - - private: - // Offset and dimensions are immutable after construction. Block descriptor - // can only be mutated by adding or dropping destination. 
- const IndexType m_offset; - const Dimensions m_dimensions; - DestinationBuffer m_destination; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockMapper is responsible for iterating over the blocks of a tensor. - -template <int NumDims, int Layout, typename IndexType = Eigen::Index> -class TensorBlockMapper { - typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor; - - public: - typedef DSizes<IndexType, NumDims> Dimensions; - - TensorBlockMapper() = default; - TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, - const TensorBlockResourceRequirements& requirements) - : m_tensor_dimensions(dimensions), m_requirements(requirements) { - // Compute block dimensions and the total number of blocks. - InitializeBlockDimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { - return m_total_block_count; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { - return m_block_dimensions.TotalSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>& - blockDimensions() const { - return m_block_dimensions; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor - blockDescriptor(IndexType block_index) const { - static const bool isColMajor = Layout == static_cast<int>(ColMajor); - - IndexType offset = 0; - DSizes<IndexType, NumDims> dimensions; - - if (NumDims == 0) return BlockDescriptor(offset, dimensions); - - // Iterate outer -> inner dimensions. - for (int i = NumDims - 1; i >= 0; --i) { - const int dim = isColMajor ? 
i : NumDims - i - 1; - - const IndexType idx = block_index / m_block_strides[dim]; - block_index -= idx * m_block_strides[dim]; - - const IndexType coord = idx * m_block_dimensions[dim]; - dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, - m_block_dimensions[dim]); - offset += coord * m_tensor_strides[dim]; - } - - return {offset, dimensions}; - } - - private: - void InitializeBlockDimensions() { - // Requested block shape and size. - const TensorBlockShapeType shape_type = m_requirements.shape_type; - IndexType target_block_size = - numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size)); - - IndexType tensor_size = m_tensor_dimensions.TotalSize(); - - // Corner case: one of the dimensions is zero. Logic below is too complex - // to handle this case on a general basis, just use unit block size. - // Note: we must not yield blocks with zero dimensions (recipe for - // overflows/underflows, divisions by zero and NaNs later). - if (tensor_size == 0) { - for (int i = 0; i < NumDims; ++i) { - m_block_dimensions[i] = 1; - } - m_total_block_count = 0; - return; - } - - // If tensor fits into a target block size, evaluate it as a single block. - if (tensor_size <= target_block_size) { - m_block_dimensions = m_tensor_dimensions; - m_total_block_count = 1; - // The only valid block index is `0`, and in this case we do not need - // to compute real strides for tensor or blocks (see blockDescriptor). - for (int i = 0; i < NumDims; ++i) { - m_tensor_strides[i] = 0; - m_block_strides[i] = 1; - } - return; - } - - static const bool isColMajor = Layout == static_cast<int>(ColMajor); - - // Block shape skewed towards inner dimension. - if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { - IndexType coeff_to_allocate = target_block_size; - - for (int i = 0; i < NumDims; ++i) { - const int dim = isColMajor ? 
i : NumDims - i - 1; - m_block_dimensions[dim] = - numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); - coeff_to_allocate = divup( - coeff_to_allocate, - numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim])); - } - eigen_assert(coeff_to_allocate == 1); - - } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { - // Tensor will not fit within 'target_block_size' budget: calculate tensor - // block dimension sizes based on "square" dimension size target. - const IndexType dim_size_target = convert_index<IndexType>( - std::pow(static_cast<float>(target_block_size), - 1.0f / static_cast<float>(m_block_dimensions.rank()))); - - for (int i = 0; i < NumDims; ++i) { - // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it - // a multiple of the packet size. Note that reducing - // 'block_dim_size' in this manner can increase the number of - // blocks, and so will amplify any per-block overhead. - m_block_dimensions[i] = - numext::mini(dim_size_target, m_tensor_dimensions[i]); - } - - // Add any un-allocated coefficients to inner dimension(s). - IndexType total_size = m_block_dimensions.TotalSize(); - for (int i = 0; i < NumDims; ++i) { - const int dim = isColMajor ? i : NumDims - i - 1; - - if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { - const IndexType total_size_other_dims = - total_size / m_block_dimensions[dim]; - const IndexType alloc_avail = - divup<IndexType>(target_block_size, total_size_other_dims); - if (alloc_avail == m_block_dimensions[dim]) { - // Insufficient excess coefficients to allocate. 
- break; - } - m_block_dimensions[dim] = - numext::mini(m_tensor_dimensions[dim], alloc_avail); - total_size = total_size_other_dims * m_block_dimensions[dim]; - } - } - - } else { - eigen_assert(false); // unknown block shape - } - - eigen_assert(m_block_dimensions.TotalSize() >= - numext::mini<IndexType>(target_block_size, - m_tensor_dimensions.TotalSize())); - - // Calculate block counts by dimension and total block count. - DSizes<IndexType, NumDims> block_count; - for (int i = 0; i < NumDims; ++i) { - block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); - } - m_total_block_count = array_prod(block_count); - - // Calculate block strides (used for enumerating blocks). - m_tensor_strides = strides<Layout>(m_tensor_dimensions); - m_block_strides = strides<Layout>(block_count); - } - - DSizes<IndexType, NumDims> m_tensor_dimensions; - TensorBlockResourceRequirements m_requirements; - - DSizes<IndexType, NumDims> m_block_dimensions; - IndexType m_total_block_count; - - DSizes<IndexType, NumDims> m_tensor_strides; - DSizes<IndexType, NumDims> m_block_strides; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockScratchAllocator is responsible for allocating temporary buffers -// for block evaluation (output or input block materialization). Given that -// Eigen expression traversal order is deterministic, all temporary allocations -// are happening in the same order, and usually have exactly the same size. -// Scratch allocator keeps a trace of all dynamic allocations, and after the -// first block evaluation is completed, we should be able to reuse all the -// temporary buffers for the next block evaluation. 
- -template <typename Device> -class TensorBlockScratchAllocator { - public: - explicit TensorBlockScratchAllocator(const Device& device) - : m_device(device), m_allocation_index(0) {} - - ~TensorBlockScratchAllocator() { - for (size_t i = 0; i < m_allocations.size(); ++i) { - m_device.deallocate(m_allocations[i].ptr); - } - } - - void* allocate(size_t size) { - // TODO(ezhulenev): Remove when replaced with inlined vector. - if (m_allocations.capacity() == 0) m_allocations.reserve(8); - - // Check if we already have an existing allocation att current index. - const int num_allocations = static_cast<int>(m_allocations.size()); - const bool has_allocation = m_allocation_index < num_allocations; - - // Allocation index can't be larger than the number of allocations. - eigen_assert(m_allocation_index <= num_allocations); - - // If we have existing allocation, and its size is larger or equal to - // requested size, we do nothing. - - // If current allocation can't fit requested size, we deallocate it, and - // replace with a larger allocation. - if (has_allocation && m_allocations[m_allocation_index].size < size) { - m_device.deallocate(m_allocations[m_allocation_index].ptr); - m_allocations[m_allocation_index].ptr = m_device.allocate(size); - m_allocations[m_allocation_index].size = size; - } - - // Make a new allocation if we don't have and existing one. - if (!has_allocation) { - Allocation allocation; - allocation.ptr = m_device.allocate(size); - allocation.size = size; - m_allocations.push_back(allocation); - } - - eigen_assert(m_allocations[m_allocation_index].ptr != NULL); - eigen_assert(m_allocations[m_allocation_index].size >= size); - - return m_allocations[m_allocation_index++].ptr; - } - - void reset() { m_allocation_index = 0; } - - private: - struct Allocation { - void* ptr; - size_t size; - }; - - const Device& m_device; - int m_allocation_index; - // TODO(ezhulenev): This should be an inlined vector. 
- std::vector<Allocation> m_allocations; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockKind represents all possible block kinds, that can be produced by -// TensorEvaluator::evalBlock function. -enum TensorBlockKind { - // Tensor block that is a lazy expression that must be assigned to a - // destination using TensorBlockAssign. - kExpr, - - // Tensor block that is a view into a memory buffer owned by an underlying - // Tensor expression (e.g. it can be a view into a Tensor buffer). - kView, - - // Tensor block that was materialized in a scratch memory buffer, allocated - // with TensorBlockScratchAllocator. This block must be copied to a - // destination, similar to a block of `kExpr` type. - kMaterializedInScratch, - - // Tensor block that was materialized directly into the final output memory - // buffer. For example if the left side of an assignment is a Tensor, we can - // directly materialize the block in the destination memory. - // - // If strides in the output buffer do not match tensor block strides, the - // Tensor expression will be invalid, and should not be used by - // TensorBlockAssign or for constructing another block expression. - kMaterializedInOutput -}; - -// -------------------------------------------------------------------------- // -// TensorBlockNotImplemented should be used to defined TensorBlock typedef in -// TensorEvaluators that do not support block evaluation. - -class TensorBlockNotImplemented { - public: - typedef void XprType; -}; - -// -------------------------------------------------------------------------- // -// XprScalar extracts Scalar type from the Eigen expressions (if expression type -// is not void). It's required to be able to define lazy block expression for -// argument types, that do not support block evaluation. 

// Primary template: forwards the expression's nested `Scalar` type.
template <typename XprType>
struct XprScalar {
  using type = typename XprType::Scalar;
};
// `void` expressions (no block access) have no scalar type.
template <>
struct XprScalar<void> {
  using type = void;
};

// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions, that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer, it's either a memory
// buffer that backs the original expression (e.g. block is just a view into a
// Tensor), or a memory buffer allocated with scratch allocator, and in this
// case the scratch allocator will deallocate it at the end of block based
// expression execution.
//
// If the block was evaluated directly into the output buffer, and strides in
// the output buffer do not match block strides, the TensorMap expression will
// be invalid, and should never be used in block assignment or any other tensor
// expression.
- -template <typename Scalar, int NumDims, int Layout, - typename IndexType = Eigen::Index> -class TensorMaterializedBlock { - public: - typedef DSizes<IndexType, NumDims> Dimensions; - typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType; - - TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, - const Dimensions& dimensions, bool valid_expr = true) - : m_kind(kind), - m_data(data), - m_dimensions(dimensions), - m_expr(m_data, m_dimensions), - m_valid_expr(valid_expr) { - eigen_assert(m_kind == internal::TensorBlockKind::kView || - m_kind == internal::TensorBlockKind::kMaterializedInScratch || - m_kind == internal::TensorBlockKind::kMaterializedInOutput); - } - - TensorBlockKind kind() const { return m_kind; } - // NOTE(ezhulenev): Returning XprType by value like in other block types - // causes asan failures. The theory is that XprType::Nested doesn't work - // properly for TensorMap. - const XprType& expr() const { - eigen_assert(m_valid_expr); - return m_expr; - } - const Scalar* data() const { return m_data; } - void cleanup() {} - - typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc; - - // TensorMaterializedBlock can be backed by different types of storage: - // - // (1) Contiguous block of memory allocated with scratch allocator. - // (2) Contiguous block of memory reused from tensor block descriptor - // destination buffer. - // (3) Strided block of memory reused from tensor block descriptor - // destination buffer. - // - class Storage { - public: - Scalar* data() const { return m_data; } - const Dimensions& dimensions() const { return m_dimensions; } - const Dimensions& strides() const { return m_strides; } - - TensorMaterializedBlock AsTensorMaterializedBlock() const { - return TensorMaterializedBlock( - m_materialized_in_output - ? 
internal::TensorBlockKind::kMaterializedInOutput - : internal::TensorBlockKind::kMaterializedInScratch, - m_data, m_dimensions, !m_strided_storage); - } - - private: - friend class TensorMaterializedBlock; - - Storage(Scalar* data, const Dimensions& dimensions, - const Dimensions& strides, bool materialized_in_output, - bool strided_storage) - : m_data(data), - m_dimensions(dimensions), - m_strides(strides), - m_materialized_in_output(materialized_in_output), - m_strided_storage(strided_storage) {} - - Scalar* m_data; - Dimensions m_dimensions; - Dimensions m_strides; - bool m_materialized_in_output; - bool m_strided_storage; - }; - - // Creates a storage for materialized block either from the block descriptor - // destination buffer, or allocates a new buffer with scratch allocator. - template <typename TensorBlockScratch> - EIGEN_STRONG_INLINE static Storage prepareStorage( - TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool allow_strided_storage = false) { - // Try to reuse destination as an output block buffer. 
- typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; - - if (desc.destination().kind() == DestinationBuffer::kContiguous) { - Scalar* buffer = desc.destination().template data<Scalar>(); - desc.DropDestinationBuffer(); - return Storage(buffer, desc.dimensions(), - internal::strides<Layout>(desc.dimensions()), - /*materialized_in_output=*/true, - /*strided_storage=*/false); - - } else if (desc.destination().kind() == DestinationBuffer::kStrided && - allow_strided_storage) { - Scalar* buffer = desc.destination().template data<Scalar>(); - desc.DropDestinationBuffer(); - return Storage(buffer, desc.dimensions(), desc.destination().strides(), - /*materialized_in_output=*/true, /*strided_storage=*/true); - - } else { - void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); - return Storage(static_cast<Scalar*>(mem), desc.dimensions(), - internal::strides<Layout>(desc.dimensions()), - /*materialized_in_output=*/false, - /*strided_storage=*/false); - } - } - - // Creates a materialized block for the given descriptor from a memory buffer. - template <typename DataDimensions, typename TensorBlockScratch> - EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( - const Scalar* data, const DataDimensions& data_dims, - TensorBlockDesc& desc, TensorBlockScratch& scratch) { - eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size()); - - // If a tensor block dimensions covers a contiguous block of the underlying - // memory, we can skip block buffer memory allocation, and construct a block - // from existing `data` memory buffer. - // - // Example: (RowMajor layout) - // data_dims: [11, 12, 13, 14] - // desc.dimensions(): [1, 1, 3, 14] - // - // In this case we can construct a TensorBlock starting at - // `data + desc.offset()`, with a `desc.dimensions()` block sizes. - static const bool is_col_major = Layout == ColMajor; - - // Find out how many inner dimensions have a matching size. 
- int num_matching_inner_dims = 0; - for (int i = 0; i < NumDims; ++i) { - int dim = is_col_major ? i : NumDims - i - 1; - if (data_dims[dim] != desc.dimensions()[dim]) break; - ++num_matching_inner_dims; - } - - // All the outer dimensions must be of size `1`, except a single dimension - // before the matching inner dimension (`3` in the example above). - bool can_use_direct_access = true; - for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { - int dim = is_col_major ? i : NumDims - i - 1; - if (desc.dimension(dim) != 1) { - can_use_direct_access = false; - break; - } - } - - if (can_use_direct_access) { - const Scalar* block_start = data + desc.offset(); - return TensorMaterializedBlock(internal::TensorBlockKind::kView, - block_start, desc.dimensions()); - - } else { - // Reuse destination buffer or allocate new buffer with scratch allocator. - const Storage storage = prepareStorage(desc, scratch); - - typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> - TensorBlockIO; - typedef typename TensorBlockIO::Dst TensorBlockIODst; - typedef typename TensorBlockIO::Src TensorBlockIOSrc; - - TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), - data, desc.offset()); - TensorBlockIODst dst(storage.dimensions(), storage.strides(), - storage.data()); - - TensorBlockIO::Copy(dst, src); - return storage.AsTensorMaterializedBlock(); - } - } - - private: - TensorBlockKind m_kind; - const Scalar* m_data; - Dimensions m_dimensions; - XprType m_expr; - bool m_valid_expr; -}; - -// -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp -// functor to the blocks produced by the underlying Tensor expression. 
- -template <typename UnaryOp, typename ArgTensorBlock> -class TensorCwiseUnaryBlock { - static const bool NoArgBlockAccess = - internal::is_void<typename ArgTensorBlock::XprType>::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >:: - type XprType; - - typedef typename XprScalar<XprType>::type Scalar; - - TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) - : m_arg_block(arg_block), m_functor(functor) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - - XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } - const Scalar* data() const { return NULL; } - void cleanup() { m_arg_block.cleanup(); } - - private: - ArgTensorBlock m_arg_block; - UnaryOp m_functor; -}; - -// -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp -// functor to the blocks produced by the underlying Tensor expression. 
- -template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock> -class TensorCwiseBinaryBlock { - static const bool NoArgBlockAccess = - internal::is_void<typename LhsTensorBlock::XprType>::value || - internal::is_void<typename RhsTensorBlock::XprType>::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, - const typename RhsTensorBlock::XprType> >::type - XprType; - - typedef typename XprScalar<XprType>::type Scalar; - - TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, - const RhsTensorBlock& right_block, - const BinaryOp& functor) - : m_left_block(left_block), - m_right_block(right_block), - m_functor(functor) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - - XprType expr() const { - return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); - } - - const Scalar* data() const { return NULL; } - - void cleanup() { - m_left_block.cleanup(); - m_right_block.cleanup(); - } - - private: - LhsTensorBlock m_left_block; - RhsTensorBlock m_right_block; - BinaryOp m_functor; -}; - -// -------------------------------------------------------------------------- // -// TensorUnaryExprBlock is a lazy tensor expression block that can construct -// an arbitrary tensor expression from a block of the underlying type (this is a -// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). 
- -template <typename BlockFactory, typename ArgTensorBlock> -class TensorUnaryExprBlock { - typedef typename ArgTensorBlock::XprType ArgXprType; - static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - typename BlockFactory::template XprType<ArgXprType>::type>::type XprType; - - typedef typename XprScalar<XprType>::type Scalar; - - TensorUnaryExprBlock(const ArgTensorBlock& arg_block, - const BlockFactory& factory) - : m_arg_block(arg_block), m_factory(factory) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - XprType expr() const { return m_factory.expr(m_arg_block.expr()); } - const Scalar* data() const { return NULL; } - void cleanup() { m_arg_block.cleanup(); } - - private: - ArgTensorBlock m_arg_block; - BlockFactory m_factory; -}; - -// -------------------------------------------------------------------------- // -// TensorTernaryExprBlock is a lazy tensor expression block that can construct -// an arbitrary tensor expression from three blocks of the underlying type. 
- -template <typename BlockFactory, typename Arg1TensorBlock, - typename Arg2TensorBlock, typename Arg3TensorBlock> -class TensorTernaryExprBlock { - typedef typename Arg1TensorBlock::XprType Arg1XprType; - typedef typename Arg2TensorBlock::XprType Arg2XprType; - typedef typename Arg3TensorBlock::XprType Arg3XprType; - - static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value || - internal::is_void<Arg2XprType>::value || - internal::is_void<Arg3XprType>::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, - Arg3XprType>::type>::type XprType; - - typedef typename XprScalar<XprType>::type Scalar; - - TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, - const Arg2TensorBlock& arg2_block, - const Arg3TensorBlock& arg3_block, - const BlockFactory& factory) - : m_arg1_block(arg1_block), - m_arg2_block(arg2_block), - m_arg3_block(arg3_block), - m_factory(factory) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - XprType expr() const { - return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), - m_arg3_block.expr()); - } - const Scalar* data() const { return NULL; } - void cleanup() { - m_arg1_block.cleanup(); - m_arg2_block.cleanup(); - m_arg3_block.cleanup(); - } - - private: - Arg1TensorBlock m_arg1_block; - Arg2TensorBlock m_arg2_block; - Arg3TensorBlock m_arg3_block; - BlockFactory m_factory; -}; - -// -------------------------------------------------------------------------- // -// StridedLinearBufferCopy provides a method to copy data between two linear -// buffers with different strides, with optimized paths for scatter/gather. 
- -template <typename Scalar, typename IndexType> -class StridedLinearBufferCopy { - typedef typename packet_traits<Scalar>::type Packet; - enum { - Vectorizable = packet_traits<Scalar>::Vectorizable, - PacketSize = packet_traits<Scalar>::size - }; - - public: - // Specifying linear copy kind statically gives ~30% speedup for small sizes. - enum class Kind { - Linear = 0, // src_stride == 1 && dst_stride == 1 - Scatter = 1, // src_stride == 1 && dst_stride != 1 - FillLinear = 2, // src_stride == 0 && dst_stride == 1 - FillScatter = 3, // src_stride == 0 && dst_stride != 1 - Gather = 4, // dst_stride == 1 - Random = 5 // everything else - }; - - struct Dst { - Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} - - IndexType offset; - IndexType stride; - Scalar* data; - }; - - struct Src { - Src(IndexType o, IndexType s, const Scalar* d) - : offset(o), stride(s), data(d) {} - - IndexType offset; - IndexType stride; - const Scalar* data; - }; - - template <typename StridedLinearBufferCopy::Kind kind> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, - const Src& src, - const size_t count) { - Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, - src.data); - } - - private: - template <typename StridedLinearBufferCopy::Kind kind> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const IndexType count, const IndexType dst_offset, - const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, - const IndexType src_offset, const IndexType src_stride, - const Scalar* EIGEN_RESTRICT src_data) { - const Scalar* src = &src_data[src_offset]; - Scalar* dst = &dst_data[dst_offset]; - - if (!Vectorizable) { - for (Index i = 0; i < count; ++i) { - dst[i * dst_stride] = src[i * src_stride]; - } - return; - } - - const IndexType vectorized_size = count - PacketSize; - IndexType i = 0; - - if (kind == StridedLinearBufferCopy::Kind::Linear) { - // 
******************************************************************** // - // Linear copy from `src` to `dst`. - const IndexType unrolled_size = count - 4 * PacketSize; - eigen_assert(src_stride == 1 && dst_stride == 1); - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - Packet p = ploadu<Packet>(src + i + j * PacketSize); - pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p); - } - } - for (; i <= vectorized_size; i += PacketSize) { - Packet p = ploadu<Packet>(src + i); - pstoreu<Scalar, Packet>(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = src[i]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { - // Scatter from `src` to `dst`. - eigen_assert(src_stride == 1 && dst_stride != 1); - for (; i <= vectorized_size; i += PacketSize) { - Packet p = ploadu<Packet>(src + i); - pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride); - } - for (; i < count; ++i) { - dst[i * dst_stride] = src[i]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { - // Fill `dst` with value at `*src`. - eigen_assert(src_stride == 0 && dst_stride == 1); - const IndexType unrolled_size = count - 4 * PacketSize; - Packet p = pload1<Packet>(src); - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p); - } - } - for (; i <= vectorized_size; i += PacketSize) { - pstoreu<Scalar, Packet>(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = *src; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { - // Scatter `*src` into `dst`. 
- eigen_assert(src_stride == 0 && dst_stride != 1); - Packet p = pload1<Packet>(src); - for (; i <= vectorized_size; i += PacketSize) { - pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride); - } - for (; i < count; ++i) { - dst[i * dst_stride] = *src; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Gather) { - // Gather from `src` into `dst`. - eigen_assert(dst_stride == 1); - for (; i <= vectorized_size; i += PacketSize) { - Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride); - pstoreu<Scalar, Packet>(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = src[i * src_stride]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Random) { - // Random. - for (; i < count; ++i) { - dst[i * dst_stride] = src[i * src_stride]; - } - } else { - eigen_assert(false); - } - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. -// It's possible to specify src->dst dimension mapping for the copy operation. -// Dimensions of `dst` specify how many elements have to be copied, for the -// `src` we need to know only stride to navigate through source memory buffer. 
- -template <typename Scalar, typename IndexType, int NumDims, int Layout> -class TensorBlockIO { - static const bool IsColMajor = (Layout == ColMajor); - - typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy; - - public: - typedef DSizes<IndexType, NumDims> Dimensions; - typedef DSizes<int, NumDims> DimensionsMap; - - struct Dst { - Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, - IndexType dst_offset = 0) - : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} - - Dimensions dims; - Dimensions strides; - Scalar* data; - IndexType offset; - }; - - struct Src { - Src(const Dimensions& src_strides, const Scalar* src, - IndexType src_offset = 0) - : strides(src_strides), data(src), offset(src_offset) {} - - Dimensions strides; - const Scalar* data; - IndexType offset; - }; - - // Copies data to `dst` from `src`, using provided dimensions mapping: - // - // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] - // - // Returns the number of copied elements. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( - const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { - // Copy single scalar value from `src` to `dst`. - if (NumDims == 0) { - *(dst.data + dst.offset) = *(src.data + src.offset); - return 1; - } - - // Both `dst` and `src` must have contiguous innermost dimension. We also - // accept the special case with stride '0', because it's used as a trick to - // implement broadcasting. - { - int inner_dim = IsColMajor ? 0 : NumDims - 1; - EIGEN_UNUSED_VARIABLE(inner_dim); - eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); - eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); - } - - // Give a shorter name to `dst_to_src_dim_map`. - const DimensionsMap& dim_map = dst_to_src_dim_map; - - // Do not squeeze reordered inner dimensions. 
- int num_squeezable_dims = NumSqueezableInnerDims(dim_map); - - // NOTE: We find the innermost dimension (contiguous in memory) in the dst - // block, and we write data linearly into that dimension, reading it from - // the src. If dimensions are reordered, we might end up reading data from - // the src with `stride != 1`. - // - // NOTE: Random-Read/Linear-Write can be up to ~2X faster than - // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 - - // Find the innermost dimension in the dst whose size is not 1. This is the - // effective inner dim. - int num_size_one_inner_dims = 0; - for (int i = 0; i < num_squeezable_dims; ++i) { - const int dst_dim = IsColMajor ? i : NumDims - i - 1; - if (dst.dims[dst_dim] != 1) break; - num_size_one_inner_dims++; - } - - // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. - if (num_size_one_inner_dims == NumDims) { - *(dst.data + dst.offset) = *(src.data + src.offset); - return 1; - } - - // Outermost dimension in the dst with `stride == 1` (contiguous in memory). - const int dst_stride1_dim = IsColMajor - ? num_size_one_inner_dims - : NumDims - num_size_one_inner_dims - 1; - - // Dimension in the src that corresponds to the dst innermost dimension. - const int src_dim_for_dst_stride1_dim = - NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; - - // Size of the innermost dimension (length of contiguous blocks of memory). - IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; - - // Squeeze multiple inner dims into one if they are contiguous in `dst` and - // `src` memory, so we can do less linear copy calls. - for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { - const int dst_dim = IsColMajor ? 
i : NumDims - i - 1; - const IndexType dst_stride = dst.strides[dst_dim]; - const IndexType src_stride = src.strides[dim_map[dst_dim]]; - if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { - dst_inner_dim_size *= dst.dims[dst_dim]; - ++num_size_one_inner_dims; - } else { - break; - } - } - - // Setup strides to read data from `src` and write to `dst`. - IndexType input_offset = src.offset; - IndexType output_offset = dst.offset; - IndexType input_stride = - NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; - IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; - - const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; - array<BlockIteratorState, at_least_1_dim> it; - - // Initialize block iterator state. Squeeze away any dimension of size 1. - int idx = 0; // currently initialized iterator state index - for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { - const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; - if (dst.dims[dst_dim] == 1) continue; - - it[idx].size = dst.dims[dst_dim]; - it[idx].input_stride = src.strides[dim_map[dst_dim]]; - it[idx].output_stride = dst.strides[dst_dim]; - - it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); - it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); - - idx++; - } - - // Iterate copying data from src to dst. - const IndexType block_total_size = NumDims == 0 ? 
1 : dst.dims.TotalSize(); - -#define COPY_INNER_DIM(KIND) \ - IndexType num_copied = 0; \ - for (num_copied = 0; num_copied < block_total_size; \ - num_copied += dst_inner_dim_size) { \ - LinCopy::template Run<KIND>( \ - typename LinCopy::Dst(output_offset, output_stride, dst.data), \ - typename LinCopy::Src(input_offset, input_stride, src.data), \ - dst_inner_dim_size); \ - \ - for (int j = 0; j < idx; ++j) { \ - if (++it[j].count < it[j].size) { \ - input_offset += it[j].input_stride; \ - output_offset += it[j].output_stride; \ - break; \ - } \ - it[j].count = 0; \ - input_offset -= it[j].input_span; \ - output_offset -= it[j].output_span; \ - } \ - } \ - return num_copied; - - if (input_stride == 1 && output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::Linear); - } else if (input_stride == 1 && output_stride != 1) { - COPY_INNER_DIM(LinCopy::Kind::Scatter); - } else if (input_stride == 0 && output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::FillLinear); - } else if (input_stride == 0 && output_stride != 1) { - COPY_INNER_DIM(LinCopy::Kind::FillScatter); - } else if (output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::Gather); - } else { - COPY_INNER_DIM(LinCopy::Kind::Random); - } - -#undef COPY_INNER_DIM - } - - // Copy from `src` to `dst` with an identity src->dst dimension map. Returns - // the number of copied elements. 
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, - const Src& src) { - DimensionsMap dst_to_src_map; - for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; - return Copy(dst, src, dst_to_src_map); - } - - private: - struct BlockIteratorState { - BlockIteratorState() - : size(0), - count(0), - input_stride(0), - output_stride(0), - input_span(0), - output_span(0) {} - - IndexType size; - IndexType count; - IndexType input_stride; - IndexType output_stride; - IndexType input_span; - IndexType output_span; - }; - - // Compute how many inner dimensions it's allowed to squeeze when doing IO - // between two tensor blocks. It's safe to squeeze inner dimensions, only - // if they are not reordered. - static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { - int num_squeezable_dims = 0; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - if (dim_map[dim] != dim) break; - num_squeezable_dims++; - } - return num_squeezable_dims; - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to -// a Tensor block defined by `desc`, backed by a memory buffer at `target`. -// -// Currently there is no way to write from a Tensor expression to a block of -// memory, if dimensions are reordered. If you need to do that, you should -// materialize a Tensor block expression into a memory buffer, and then use -// TensorBlockIO to copy data between two memory buffers with a custom -// `target->src` dimension map (see definition above). -// -// Also currently the innermost dimension of `target` must have a stride '1' -// (contiguous in memory). This restriction could be lifted with a `pscatter`, -// but in practice it's never needed, and there is a similar TensorBlockIO -// workaround for that. 
-// -// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO -// where `src` is a tensor expression. Explore if it is possible to rewrite IO -// to use expressions instead of pointers, and after that TensorBlockAssignment -// will become an alias to IO. -template <typename Scalar, int NumDims, typename TensorBlockExpr, - typename IndexType = Eigen::Index> -class TensorBlockAssignment { - // We will use coeff/packet path to evaluate block expressions. - typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice> - TensorBlockEvaluator; - - typedef DSizes<IndexType, NumDims> Dimensions; - - enum { - Vectorizable = packet_traits<Scalar>::Vectorizable, - PacketSize = packet_traits<Scalar>::size - }; - - template <bool Vectorizable, typename Evaluator> - struct InnerDimAssign { - EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, - const Evaluator& eval, - IndexType eval_offset) { - for (IndexType i = 0; i < count; ++i) { - target[i] = eval.coeff(eval_offset + i); - } - } - }; - - template <typename Evaluator> - struct InnerDimAssign<true, Evaluator> { - EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, - const Evaluator& eval, - IndexType eval_offset) { - typedef typename packet_traits<Scalar>::type Packet; - - const IndexType unrolled_size = count - 4 * PacketSize; - const IndexType vectorized_size = count - PacketSize; - IndexType i = 0; - - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - const IndexType idx = eval_offset + i + j * PacketSize; - Packet p = eval.template packet<Unaligned>(idx); - pstoreu<Scalar>(target + i + j * PacketSize, p); - } - } - - for (; i <= vectorized_size; i += PacketSize) { - Packet p = eval.template packet<Unaligned>(eval_offset + i); - pstoreu<Scalar>(target + i, p); - } - - for (; i < count; ++i) { - target[i] = eval.coeff(eval_offset + i); - } - } - }; - - public: - struct Target { - Target(const Dimensions& target_dims, const Dimensions& 
target_strides, - Scalar* target_data, IndexType target_offset = 0) - : dims(target_dims), - strides(target_strides), - data(target_data), - offset(target_offset) {} - - Dimensions dims; - Dimensions strides; - Scalar* data; - IndexType offset; - }; - - static Target target(const Dimensions& target_dims, - const Dimensions& target_strides, Scalar* target_data, - IndexType target_offset = 0) { - return Target(target_dims, target_strides, target_data, target_offset); - } - - template <typename TargetDimsIndexType, typename TargetStridesIndexType> - static Target target( - const DSizes<TargetDimsIndexType, NumDims>& target_dims, - const DSizes<TargetStridesIndexType, NumDims>& target_strides, - Scalar* target_data, IndexType target_offset = 0) { - // DSizes constructor will do index type promotion if it's safe. - return Target(Dimensions(target_dims), Dimensions(target_strides), - target_data, target_offset); - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Target& target, const TensorBlockExpr& expr) { - // Prepare evaluator for block expression. - DefaultDevice default_device; - TensorBlockEvaluator eval(expr, default_device); - - // Tensor block expression dimension should match destination dimensions. - eigen_assert(dimensions_match(target.dims, eval.dimensions())); - - static const int Layout = TensorBlockEvaluator::Layout; - static const bool is_col_major = Layout == ColMajor; - - // Initialize output inner dimension size based on a layout. - const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); - const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; - IndexType output_inner_dim_size = target.dims[inner_dim_idx]; - - // Target inner dimension stride must be '1'. - eigen_assert(target.strides[inner_dim_idx] == 1); - - // Squeeze multiple inner dims into one if they are contiguous in `target`. - IndexType num_squeezed_dims = 0; - for (Index i = 1; i < NumDims; ++i) { - const Index dim = is_col_major ? 
i : NumDims - i - 1; - const IndexType target_stride = target.strides[dim]; - - if (output_inner_dim_size == target_stride) { - output_inner_dim_size *= target.dims[dim]; - num_squeezed_dims++; - } else { - break; - } - } - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array<BlockIteratorState, NumDims> it; - - int idx = 0; // currently initialized iterator state index - for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { - const Index dim = is_col_major ? i + 1 : NumDims - i - 2; - - it[idx].count = 0; - it[idx].size = target.dims[dim]; - it[idx].output_stride = target.strides[dim]; - it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); - idx++; - } - - // We read block expression from the beginning, and start writing data to - // `target` at given offset. - IndexType input_offset = 0; - IndexType output_offset = target.offset; - - // Iterate copying data from `eval` to `target`. - for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { - // Assign to `target` at current offset. - InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, - TensorBlockEvaluator>::Run(target.data + output_offset, - output_inner_dim_size, eval, - input_offset); - - // Move input offset forward by the number of assigned coefficients. - input_offset += output_inner_dim_size; - - // Update index. 
- for (int j = 0; j < idx; ++j) { - if (++it[j].count < it[j].size) { - output_offset += it[j].output_stride; - break; - } - it[j].count = 0; - output_offset -= it[j].output_span; - } - } - } - - private: - struct BlockIteratorState { - BlockIteratorState() - : count(0), size(0), output_stride(0), output_span(0) {} - - IndexType count; - IndexType size; - IndexType output_stride; - IndexType output_span; - }; -}; - -// -------------------------------------------------------------------------- // - -} // namespace internal -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h deleted file mode 100644 index a354132..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h +++ /dev/null @@ -1,1093 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H -#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H - -namespace Eigen { - -/** \class TensorBroadcasting - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor broadcasting class. 
- * - * - */ -namespace internal { -template<typename Broadcast, typename XprType> -struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename Broadcast, typename XprType> -struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> -{ - typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type; -}; - -template<typename Broadcast, typename XprType> -struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type> -{ - typedef TensorBroadcastingOp<Broadcast, XprType> type; -}; - -template <typename Dims> -struct is_input_scalar { - static const bool value = false; -}; -template <> -struct is_input_scalar<Sizes<> > { - static const bool value = true; -}; -#ifndef EIGEN_EMULATE_CXX11_META_H -template <typename std::ptrdiff_t... 
Indices> -struct is_input_scalar<Sizes<Indices...> > { - static const bool value = (Sizes<Indices...>::total_size == 1); -}; -#endif - -} // end namespace internal - - - -template<typename Broadcast, typename XprType> -class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested; - typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) - : m_xpr(expr), m_broadcast(broadcast) {} - - EIGEN_DEVICE_FUNC - const Broadcast& broadcast() const { return m_broadcast; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Broadcast m_broadcast; -}; - - -// Eval as rvalue -template<typename Broadcast, typename ArgType, typename Device> -struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> -{ - typedef TensorBroadcastingOp<Broadcast, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = 
PacketType<CoeffReturnType, Device>::size; - protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout; - bool isCopy, nByOne, oneByN; - public: - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - RawAccess = false - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - // We do block based broadcasting using a trick with 2x tensor rank and 0 - // strides. See block method implementation for details. - typedef DSizes<Index, 2 * NumDims> BroadcastDimensions; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock - ArgTensorBlock; - - typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : isCopy(false), nByOne(false), oneByN(false), - m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device) - { - - // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar - // and store the result in a scalar. Instead one should reshape the scalar into a a N-D - // tensor with N >= 1 of 1 element first and then broadcast. 
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - const InputDimensions& input_dims = m_impl.dimensions(); - isCopy = true; - for (int i = 0; i < NumDims; ++i) { - eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i] * m_broadcast[i]; - if (m_broadcast[i] != 1) { - isCopy = false; - } - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - } else { - m_inputStrides[NumDims-1] = 1; - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims-2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - } - } - - if (input_dims[0] == 1) { - oneByN = true; - for (int i = 1; i < NumDims; ++i) { - if (m_broadcast[i] != 1) { - oneByN = false; - break; - } - } - } else if (input_dims[NumDims-1] == 1) { - nByOne = true; - for (int i = 0; i < NumDims-1; ++i) { - if (m_broadcast[i] != 1) { - nByOne = false; - break; - } - } - } - - // Handle special format like NCHW, its input shape is '[1, N..., 1]' and - // broadcast shape is '[N, 1..., N]' - if (!oneByN && !nByOne) { - if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) { - nByOne = true; - oneByN = true; - for (int i = 1; i < NumDims-1; ++i) { - if (m_broadcast[i] != 1) { - nByOne = false; - oneByN = false; - break; - } - } - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - 
m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const - { - if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) { - return m_impl.coeff(0); - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - if (isCopy) { - return m_impl.coeff(index); - } else { - return coeffColMajor(index); - } - } else { - if (isCopy) { - return m_impl.coeff(index); - } else { - return coeffRowMajor(index); - } - } - } - - // TODO: attempt to speed this up. The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { - Index inputIndex = 0; - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq<Broadcast>(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq<InputDimensions>(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - if (internal::index_statically_eq<Broadcast>(0, 1)) { - eigen_assert(index < m_impl.dimensions()[0]); - inputIndex += index; - } else { - if (internal::index_statically_eq<InputDimensions>(0, 1)) { - eigen_assert(index % m_impl.dimensions()[0] == 0); - } else { - inputIndex += (index % m_impl.dimensions()[0]); - } - } - return inputIndex; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { - return m_impl.coeff(indexColMajor(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { - Index inputIndex = 0; - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; 
++i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq<Broadcast>(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq<InputDimensions>(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims - 1]); - inputIndex += index; - } else { - if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); - } else { - inputIndex += (index % m_impl.dimensions()[NumDims - 1]); - } - } - return inputIndex; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const - { - return m_impl.coeff(indexRowMajor(index)); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const - { - if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) { - return internal::pset1<PacketReturnType>(m_impl.coeff(0)); - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - if (isCopy) { - #ifdef EIGEN_GPU_COMPILE_PHASE - // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing - // unaligned loads here. The reason is unclear though. - return m_impl.template packet<Unaligned>(index); - #else - return m_impl.template packet<LoadMode>(index); - #endif - } else if (oneByN && !nByOne) { - return packetNByOne<LoadMode>(index); - } else if (!oneByN && nByOne) { - return packetOneByN<LoadMode>(index); - } else if (oneByN && nByOne) { - return packetOneByNByOne<LoadMode>(index); - } else { - return packetColMajor<LoadMode>(index); - } - } else { - if (isCopy) { - #ifdef EIGEN_GPU_COMPILE_PHASE - // See above. 
- return m_impl.template packet<Unaligned>(index); - #else - return m_impl.template packet<LoadMode>(index); - #endif - } else if (oneByN && !nByOne) { - return packetOneByN<LoadMode>(index); - } else if (!oneByN && nByOne) { - return packetNByOne<LoadMode>(index); - } else if (oneByN && nByOne) { - return packetOneByNByOne<LoadMode>(index); - } else { - return packetRowMajor<LoadMode>(index); - } - } - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne - (Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - Index startDim, endDim; - Index inputIndex, outputOffset, batchedIndex; - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - startDim = NumDims - 1; - endDim = 1; - } else { - startDim = 0; - endDim = NumDims - 2; - } - - batchedIndex = index % m_outputStrides[startDim]; - inputIndex = batchedIndex / m_outputStrides[endDim]; - outputOffset = batchedIndex % m_outputStrides[endDim]; - - if (outputOffset + PacketSize <= m_outputStrides[endDim]) { - values[0] = m_impl.coeff(inputIndex); - return internal::pload1<PacketReturnType>(values); - } else { - EIGEN_UNROLL_LOOP - for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { - if (outputOffset + cur < m_outputStrides[endDim]) { - values[i] = m_impl.coeff(inputIndex); - } else { - ++inputIndex; - inputIndex = (inputIndex == m_inputStrides[startDim] ? 
0 : inputIndex); - values[i] = m_impl.coeff(inputIndex); - outputOffset = 0; - cur = 0; - } - } - return internal::pload<PacketReturnType>(values); - } - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - Index dim, inputIndex; - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - dim = NumDims - 1; - } else { - dim = 0; - } - - inputIndex = index % m_inputStrides[dim]; - if (inputIndex + PacketSize <= m_inputStrides[dim]) { - return m_impl.template packet<Unaligned>(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - if (inputIndex > m_inputStrides[dim]-1) { - inputIndex = 0; - } - values[i] = m_impl.coeff(inputIndex++); - } - return internal::pload<PacketReturnType>(values); - } - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - Index dim, inputIndex, outputOffset; - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - dim = 1; - } else { - dim = NumDims - 2; - } - - inputIndex = index / m_outputStrides[dim]; - outputOffset = index % m_outputStrides[dim]; - if (outputOffset + PacketSize <= m_outputStrides[dim]) { - values[0] = m_impl.coeff(inputIndex); - return internal::pload1<PacketReturnType>(values); - } else { - EIGEN_UNROLL_LOOP - for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { - if (outputOffset + cur < m_outputStrides[dim]) { - values[i] = m_impl.coeff(inputIndex); - } else { - values[i] = 
m_impl.coeff(++inputIndex); - outputOffset = 0; - cur = 0; - } - } - return internal::pload<PacketReturnType>(values); - } - } - - // Ignore the LoadMode and always use unaligned loads since we can't guarantee - // the alignment at compile time. - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index originalIndex = index; - - Index inputIndex = 0; - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq<Broadcast>(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq<InputDimensions>(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - Index innermostLoc; - if (internal::index_statically_eq<Broadcast>(0, 1)) { - eigen_assert(index < m_impl.dimensions()[0]); - innermostLoc = index; - } else { - if (internal::index_statically_eq<InputDimensions>(0, 1)) { - eigen_assert(index % m_impl.dimensions()[0] == 0); - innermostLoc = 0; - } else { - innermostLoc = index % m_impl.dimensions()[0]; - } - } - inputIndex += innermostLoc; - - // Todo: this could be extended to the second dimension if we're not - // broadcasting alongside the first dimension, and so on. 
- if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { - return m_impl.template packet<Unaligned>(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndex); - EIGEN_UNROLL_LOOP - for (int i = 1; i < PacketSize; ++i) { - if (innermostLoc + i < m_impl.dimensions()[0]) { - values[i] = m_impl.coeff(inputIndex+i); - } else { - values[i] = coeffColMajor(originalIndex+i); - } - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index originalIndex = index; - - Index inputIndex = 0; - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq<Broadcast>(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq<InputDimensions>(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - Index innermostLoc; - if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); - innermostLoc = index; - } else { - if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); - innermostLoc = 0; - } else { - innermostLoc = index % m_impl.dimensions()[NumDims-1]; - } - } - inputIndex += innermostLoc; - - // Todo: this could be extended to the second dimension if we're not - // broadcasting alongside the first dimension, and so on. 
- if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { - return m_impl.template packet<Unaligned>(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndex); - EIGEN_UNROLL_LOOP - for (int i = 1; i < PacketSize; ++i) { - if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { - values[i] = m_impl.coeff(inputIndex+i); - } else { - values[i] = coeffRowMajor(originalIndex+i); - } - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - double compute_cost = TensorOpCost::AddCost<Index>(); - if (!isCopy && NumDims > 0) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - compute_cost += TensorOpCost::DivCost<Index>(); - if (internal::index_statically_eq<Broadcast>(i, 1)) { - compute_cost += - TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); - } else { - if (!internal::index_statically_eq<InputDimensions>(i, 1)) { - compute_cost += TensorOpCost::MulCost<Index>() + - TensorOpCost::ModCost<Index>() + - TensorOpCost::AddCost<Index>(); - } - } - compute_cost += - TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); - } - } - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large - // tensors. But this might need further tuning. 
- const size_t target_size = m_device.firstLevelCacheSize(); - return internal::TensorBlockResourceRequirements::merge( - m_impl.getResourceRequirements(), - internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - BlockBroadcastingParams params = blockBroadcastingParams(desc); - - if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) { - return emptyBlock(); - } - - // Prepare storage for the materialized broadcasting result. - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - ScalarNoConst* materialized_output = block_storage.data(); - - // We potentially will need to materialize input blocks. - size_t materialized_input_size = 0; - ScalarNoConst* materialized_input = NULL; - - // Initialize block broadcating iterator state for outer dimensions (outer - // with regard to bcast dimension). Dimension in this array are always in - // inner_most -> outer_most order (col major layout). - array<BlockBroadcastingIteratorState, NumDims> it; - int idx = 0; - - for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { - const Index dim = IsColMajor ? i : NumDims - 1 - i; - it[idx].size = params.output_dims[dim]; - it[idx].count = 0; - it[idx].output_stride = m_outputStrides[dim]; - it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); - idx++; - } - - // Write output into the beginning of `materialized_output`. - Index output_offset = 0; - - // We will fill output block by broadcasting along the bcast dim, and - // iterating over outer dimension. - const Index output_size = NumDims == 0 ? 
1 : params.output_dims.TotalSize(); - - for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) { - ScalarNoConst* bcast_output = materialized_output + num_output_coeffs; - Index bcast_offset = desc.offset() + output_offset; - - // Broadcast along the bcast dimension. - num_output_coeffs += BroadcastBlockAlongBcastDim( - params, bcast_offset, scratch, bcast_output, &materialized_input, - &materialized_input_size); - - // Switch to the next outer dimension. - for (int j = 0; j < idx; ++j) { - if (++it[j].count < it[j].size) { - output_offset += it[j].output_stride; - break; - } - it[j].count = 0; - output_offset -= it[j].output_span; - } - } - - return block_storage.AsTensorMaterializedBlock(); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - - Broadcast functor() const { return m_broadcast; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( - cl::sycl::handler& cgh) const { - m_impl.bind(cgh); - } -#endif - private: - static const bool IsColMajor = - static_cast<int>(Layout) == static_cast<int>(ColMajor); - - // We will build a general case block broadcasting on top of broadcasting - // primitive that will do broadcasting only for the inner dimension(s) along - // the first dimension smaller than the input size (it's called `bcast_dim`). - // - // Example: - // dim: 0 1 2 (ColMajor) - // input size: [9, 3, 6] - // block size: [9, 2, 6] - // - // We will compute broadcasted block by iterating over the outer dimensions - // before `bcast_dim` (only dimension `2` in this example) and computing - // broadcasts along the `bcast_dim` (dimension `1` in this example). - - // BlockBroadcastingParams holds precomputed parameters for broadcasting a - // single block along the broadcasting dimension. 
Sizes and strides along the - // `bcast_dim` might be invalid, they will be adjusted later in - // `BroadcastBlockAlongBcastDim`. - struct BlockBroadcastingParams { - Dimensions input_dims; // input expression dimensions - Dimensions output_dims; // output block sizes - Dimensions output_strides; // output block strides - - int inner_dim_count; // count inner dimensions matching in size - int bcast_dim; // broadcasting dimension index - Index bcast_dim_size; // broadcasting dimension size - Index inner_dim_size; // inner dimensions size - - // Block sizes and strides for the input block where all dimensions before - // `bcast_dim` are equal to `1`. - Dimensions input_block_sizes; - Dimensions input_block_strides; - - // Block sizes and strides for blocks with extra dimensions and strides `0`. - BroadcastDimensions bcast_block_sizes; - BroadcastDimensions bcast_block_strides; - BroadcastDimensions bcast_input_strides; - }; - - struct BlockBroadcastingIteratorState { - Index size; - Index count; - Index output_stride; - Index output_span; - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams - blockBroadcastingParams(TensorBlockDesc& desc) const { - BlockBroadcastingParams params; - - params.input_dims = Dimensions(m_impl.dimensions()); - - // Output block sizes and strides. - params.output_dims = desc.dimensions(); - params.output_strides = internal::strides<Layout>(params.output_dims); - - // Find the broadcasting dimension (first dimension with output size smaller - // that the input size). - params.bcast_dim = 0; - params.bcast_dim_size = 1; - params.inner_dim_size = 1; - - // Count the number of inner dimensions that have the same size in the block - // and in the broadcast expression. - params.inner_dim_count = 0; - - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? 
i : NumDims - i - 1; - - if (params.output_dims[dim] == m_dimensions[dim]) { - params.inner_dim_size *= params.output_dims[dim]; - ++params.inner_dim_count; - continue; - } - - // First non-matching dimension is the broadcasting dimension. - eigen_assert(params.output_dims[dim] < m_dimensions[dim]); - params.bcast_dim = dim; - params.bcast_dim_size = params.output_dims[dim]; - break; - } - - // Calculate the input block size for looking into the input. - for (int i = 0; i < params.inner_dim_count; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - params.input_block_sizes[dim] = params.input_dims[dim]; - } - for (int i = params.inner_dim_count; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - params.input_block_sizes[dim] = 1; - } - params.input_block_strides = - internal::strides<Layout>(params.input_block_sizes); - - // Broadcast with the 0-stride trick: Create 1 extra dim for each - // broadcast, set the input stride to 0. - // - // When ColMajor: - // - // - bcast_block_sizes: - // [d_0, b_0, d_1, b_1, ...] - // - // - bcast_block_strides: - // [output_block_strides[0], output_block_strides[0] * d_0, - // output_block_strides[1], output_block_strides[1] * d_1, - // ...] - // - // - bcast_input_strides: - // [input_block_strides[0], 0, - // input_block_strides[1], 0, - // ...]. - // - for (int i = 0; i < params.inner_dim_count; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - - const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1; - const int broadcast_dim = IsColMajor ? 
copy_dim + 1 : copy_dim - 1; - - params.bcast_block_sizes[copy_dim] = params.input_dims[dim]; - params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; - params.bcast_block_strides[copy_dim] = params.output_strides[dim]; - params.bcast_block_strides[broadcast_dim] = - params.output_strides[dim] * params.input_dims[dim]; - params.bcast_input_strides[copy_dim] = params.input_block_strides[dim]; - params.bcast_input_strides[broadcast_dim] = 0; - } - - for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) { - const int dim = IsColMajor ? i : 2 * NumDims - i - 1; - params.bcast_block_sizes[dim] = 1; - params.bcast_block_strides[dim] = 0; - params.bcast_input_strides[dim] = 0; - } - - return params; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const { - DSizes<Index, NumDims> dimensions; - for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; - return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim( - BlockBroadcastingParams params, Index bcast_offset, - TensorBlockScratch& scratch, ScalarNoConst* materialized_output, - ScalarNoConst** materialized_input, - size_t* materialized_input_size) const { - if (params.bcast_dim_size == 1) { - // We just need one block read using the ready-set values above. - return BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, 0, scratch, - materialized_output, materialized_input, materialized_input_size); - - } else if (params.input_dims[params.bcast_dim] == 1) { - // Broadcast bcast dimension (< NumDims) by bcast_dim_size. - const int broadcast_bcast_dim = - IsColMajor ? 
2 * params.inner_dim_count + 1 - : 2 * NumDims - 2 * params.inner_dim_count - 2; - - params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size; - params.bcast_input_strides[broadcast_bcast_dim] = 0; - params.bcast_block_strides[broadcast_bcast_dim] = - params.output_strides[params.bcast_dim]; - - return BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, 0, scratch, - materialized_output, materialized_input, materialized_input_size); - - } else { - // Keep track of the total number of the coefficients written to the - // output block. - Index num_output_coeffs = 0; - - // The general case. Let's denote the output block as - // - // x[..., a:a+bcast_dim_size, :, ..., :] - // - // where a:a+bcast_dim_size is a slice on the bcast_dim dimension - // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3 - // sub-blocks: - // - // (1) a:b, where b is the smallest multiple of - // input_dims[bcast_dim_start] in [a, a+bcast_dim_size]. - // - // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start] - // in [a, a+bcast_dim_size]. - // - // (3) c:a+bcast_dim_size . - // - // Or, when b and c do not exist, we just need to process the whole block - // together. - - // Find a. - const Index bcast_dim_left_index = - bcast_offset / m_outputStrides[params.bcast_dim]; - - // Find b and c. - const Index input_bcast_dim_size = params.input_dims[params.bcast_dim]; - - // First multiple after a. This is b when <= bcast_dim_left_index + - // bcast_dim_size. - const Index first_multiple = - divup<Index>(bcast_dim_left_index, input_bcast_dim_size) * - input_bcast_dim_size; - - if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) { - // b exists, so does c. Find it. 
- const Index last_multiple = - (bcast_dim_left_index + params.bcast_dim_size) / - input_bcast_dim_size * input_bcast_dim_size; - const int copy_bcast_dim = - IsColMajor ? 2 * params.inner_dim_count - : 2 * NumDims - 2 * params.inner_dim_count - 1; - const int broadcast_bcast_dim = - IsColMajor ? 2 * params.inner_dim_count + 1 - : 2 * NumDims - 2 * params.inner_dim_count - 2; - - if (first_multiple > bcast_dim_left_index) { - const Index head_size = first_multiple - bcast_dim_left_index; - params.input_block_sizes[params.bcast_dim] = head_size; - params.bcast_block_sizes[copy_bcast_dim] = head_size; - params.bcast_input_strides[copy_bcast_dim] = - params.input_block_strides[params.bcast_dim]; - params.bcast_block_strides[copy_bcast_dim] = - params.output_strides[params.bcast_dim]; - params.bcast_block_sizes[broadcast_bcast_dim] = 1; - params.bcast_input_strides[broadcast_bcast_dim] = 0; - params.bcast_block_strides[broadcast_bcast_dim] = - params.output_strides[params.bcast_dim] * - params.input_dims[params.bcast_dim]; - - num_output_coeffs += BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, 0, scratch, - materialized_output, materialized_input, materialized_input_size); - } - if (first_multiple < last_multiple) { - params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size; - params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size; - params.bcast_input_strides[copy_bcast_dim] = - params.input_block_strides[params.bcast_dim]; - params.bcast_block_strides[copy_bcast_dim] = - params.output_strides[params.bcast_dim]; - params.bcast_block_sizes[broadcast_bcast_dim] = - (last_multiple - first_multiple) / input_bcast_dim_size; - params.bcast_input_strides[broadcast_bcast_dim] = 0; - params.bcast_block_strides[broadcast_bcast_dim] = - params.output_strides[params.bcast_dim] * - params.input_dims[params.bcast_dim]; - const Index offset = 
(first_multiple - bcast_dim_left_index) * - m_outputStrides[params.bcast_dim]; - - num_output_coeffs += BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, offset, scratch, - materialized_output, materialized_input, materialized_input_size); - } - if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) { - const Index tail_size = - bcast_dim_left_index + params.bcast_dim_size - last_multiple; - params.input_block_sizes[params.bcast_dim] = tail_size; - params.bcast_block_sizes[copy_bcast_dim] = tail_size; - params.bcast_input_strides[copy_bcast_dim] = - params.input_block_strides[params.bcast_dim]; - params.bcast_block_strides[copy_bcast_dim] = - params.output_strides[params.bcast_dim]; - params.bcast_block_sizes[broadcast_bcast_dim] = 1; - params.bcast_input_strides[broadcast_bcast_dim] = 0; - params.bcast_block_strides[broadcast_bcast_dim] = - params.output_strides[params.bcast_dim] * - params.input_dims[params.bcast_dim]; - const Index offset = (last_multiple - bcast_dim_left_index) * - m_outputStrides[params.bcast_dim]; - - num_output_coeffs += BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, offset, scratch, - materialized_output, materialized_input, materialized_input_size); - } - } else { - // b and c do not exist. - const int copy_bcast_dim = - IsColMajor ? 
2 * params.inner_dim_count - : 2 * NumDims - 2 * params.inner_dim_count - 1; - params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size; - params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size; - params.bcast_input_strides[copy_bcast_dim] = - params.input_block_strides[params.bcast_dim]; - params.bcast_block_strides[copy_bcast_dim] = - params.output_strides[params.bcast_dim]; - - num_output_coeffs += BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, 0, scratch, - materialized_output, materialized_input, materialized_input_size); - } - - return num_output_coeffs; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock( - const Dimensions& input_block_sizes, - const Dimensions& input_block_strides, - const BroadcastDimensions& bcast_block_sizes, - const BroadcastDimensions& bcast_block_strides, - const BroadcastDimensions& bcast_input_strides, Index bcast_offset, - Index offset, TensorBlockScratch& scratch, - ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, - size_t* materialized_input_size) const { - // ---------------------------------------------------------------------- // - // Tensor block descriptor for reading block from the input. - const Index input_offset = bcast_offset + offset; - TensorBlockDesc input_desc( - IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset), - input_block_sizes); - - ArgTensorBlock input_block = m_impl.block(input_desc, scratch); - - // ---------------------------------------------------------------------- // - // Materialize input block into a temporary memory buffer only if it's not - // already available in the arg block. - const ScalarNoConst* input_buffer = NULL; - - if (input_block.data() != NULL) { - // Input block already has raw data, there is no need to materialize it. 
- input_buffer = input_block.data(); - - } else { - // Otherwise we have to do block assignment into a temporary buffer. - - // Maybe reuse previously allocated buffer, or allocate a new one with a - // scratch allocator. - const size_t input_total_size = input_block_sizes.TotalSize(); - if (*materialized_input == NULL || - *materialized_input_size < input_total_size) { - *materialized_input_size = input_total_size; - void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); - *materialized_input = static_cast<ScalarNoConst*>(mem); - } - - typedef internal::TensorBlockAssignment< - ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index> - TensorBlockAssignment; - - TensorBlockAssignment::Run( - TensorBlockAssignment::target(input_block_sizes, input_block_strides, - *materialized_input), - input_block.expr()); - - input_buffer = *materialized_input; - } - - // ---------------------------------------------------------------------- // - // Copy data from materialized input block to the materialized output, using - // given broadcast strides (strides with zeroes). 
- typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout> - TensorBlockIO; - - typename TensorBlockIO::Src src(bcast_input_strides, input_buffer); - typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, - materialized_output + offset); - - return TensorBlockIO::Copy(dst, src); - } - -protected: - const Device EIGEN_DEVICE_REF m_device; - const typename internal::remove_reference<Broadcast>::type m_broadcast; - Dimensions m_dimensions; - array<Index, NumDims> m_outputStrides; - array<Index, NumDims> m_inputStrides; - TensorEvaluator<ArgType, Device> m_impl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h deleted file mode 100644 index 3764573..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h +++ /dev/null @@ -1,518 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H -#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H - -namespace Eigen { - -/** \class TensorKChippingReshaping - * \ingroup CXX11_Tensor_Module - * - * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. 
- * - * - */ - -namespace internal { -template<DenseIndex DimId, typename XprType> -struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions - 1; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<DenseIndex DimId, typename XprType> -struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense> -{ - typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type; -}; - -template<DenseIndex DimId, typename XprType> -struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type> -{ - typedef TensorChippingOp<DimId, XprType> type; -}; - -template <DenseIndex DimId> -struct DimensionId -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { - EIGEN_UNUSED_VARIABLE(dim); - eigen_assert(dim == DimId); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { - return DimId; - } -}; -template <> -struct DimensionId<Dynamic> -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { - eigen_assert(dim >= 0); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { - return actual_dim; - } - private: - const DenseIndex actual_dim; -}; - - -} // end namespace internal - - - -template<DenseIndex DimId, typename XprType> -class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> > -{ - public: - typedef TensorBase<TensorChippingOp<DimId, XprType> > Base; - typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename 
XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested; - typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) - : m_xpr(expr), m_offset(offset), m_dim(dim) { - } - - EIGEN_DEVICE_FUNC - const Index offset() const { return m_offset; } - EIGEN_DEVICE_FUNC - const Index dim() const { return m_dim.actualDim(); } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp) - - protected: - typename XprType::Nested m_xpr; - const Index m_offset; - const internal::DimensionId<DimId> m_dim; -}; - - -// Eval as rvalue -template<DenseIndex DimId, typename ArgType, typename Device> -struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> -{ - typedef TensorChippingOp<DimId, ArgType> XprType; - static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - static const int NumDims = NumInputDims-1; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets. 
- IsAligned = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, - // Chipping of outer-most dimension is a trivial operation, because we can - // read and write directly from the underlying tensor using single offset. - IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) || - (static_cast<int>(Layout) == RowMajor && DimId == 0), - // Chipping inner-most dimension. - IsInnerChipping = (static_cast<int>(Layout) == ColMajor && DimId == 0) || - (static_cast<int>(Layout) == RowMajor && DimId == NumInputDims - 1), - // Prefer block access if the underlying expression prefers it, otherwise - // only if chipping is not trivial. - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess || - !IsOuterChipping, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef internal::TensorBlockDescriptor<NumInputDims, Index> - ArgTensorBlockDesc; - typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock - ArgTensorBlock; - - typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) - { - EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(NumInputDims > m_dim.actualDim()); - - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims 
= m_impl.dimensions(); - eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); - - int j = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (i != m_dim.actualDim()) { - m_dimensions[j] = input_dims[i]; - ++j; - } - } - - m_stride = 1; - m_inputStride = 1; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < m_dim.actualDim(); ++i) { - m_stride *= input_dims[i]; - m_inputStride *= input_dims[i]; - } - } else { - for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { - m_stride *= input_dims[i]; - m_inputStride *= input_dims[i]; - } - } - m_inputStride *= input_dims[m_dim.actualDim()]; - m_inputOffset = m_stride * op.offset(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - if (isInnerChipping()) { - // m_stride is equal to 1, so let's avoid the integer division. - eigen_assert(m_stride == 1); - Index inputIndex = index * m_inputStride + m_inputOffset; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = m_impl.coeff(inputIndex); - inputIndex += m_inputStride; - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } else if (isOuterChipping()) { - // m_stride is always greater than index, so let's avoid the integer division. 
- eigen_assert(m_stride > index); - return m_impl.template packet<LoadMode>(index + m_inputOffset); - } else { - const Index idx = index / m_stride; - const Index rem = index - idx * m_stride; - if (rem + PacketSize <= m_stride) { - Index inputIndex = idx * m_inputStride + m_inputOffset + rem; - return m_impl.template packet<LoadMode>(inputIndex); - } else { - // Cross the stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index); - ++index; - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - double cost = 0; - if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && - m_dim.actualDim() == 0) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && - m_dim.actualDim() == NumInputDims - 1)) { - cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); - } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && - m_dim.actualDim() == NumInputDims - 1) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && - m_dim.actualDim() == 0)) { - cost += TensorOpCost::AddCost<Index>(); - } else { - cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() + - 3 * TensorOpCost::AddCost<Index>(); - } - - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = m_device.lastLevelCacheSize(); - return internal::TensorBlockResourceRequirements::merge( - internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size), - m_impl.getResourceRequirements()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - 
block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool root_of_expr_ast = false) const { - const Index chip_dim = m_dim.actualDim(); - - DSizes<Index, NumInputDims> input_block_dims; - for (int i = 0; i < NumInputDims; ++i) { - input_block_dims[i] - = i < chip_dim ? desc.dimension(i) - : i > chip_dim ? desc.dimension(i - 1) - : 1; - } - - ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); - - // Try to reuse destination buffer for materializing argument block. - if (desc.HasDestinationBuffer()) { - DSizes<Index, NumInputDims> arg_destination_strides; - for (int i = 0; i < NumInputDims; ++i) { - arg_destination_strides[i] - = i < chip_dim ? desc.destination().strides()[i] - : i > chip_dim ? desc.destination().strides()[i - 1] - : 0; // for dimensions of size `1` stride should never be used. - } - - arg_desc.template AddDestinationBuffer<Layout>( - desc.destination().template data<ScalarNoConst>(), - arg_destination_strides); - } - - ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast); - if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); - - if (arg_block.data() != NULL) { - // Forward argument block buffer if possible. - return TensorBlock(arg_block.kind(), arg_block.data(), - desc.dimensions()); - - } else { - // Assign argument block expression to a buffer. - - // Prepare storage for the materialized chipping result. 
- const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - - typedef internal::TensorBlockAssignment< - ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> - TensorBlockAssignment; - - TensorBlockAssignment::Run( - TensorBlockAssignment::target( - arg_desc.dimensions(), - internal::strides<Layout>(arg_desc.dimensions()), - block_storage.data()), - arg_block.expr()); - - return block_storage.AsTensorMaterializedBlock(); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { - typename Storage::Type result = constCast(m_impl.data()); - if (isOuterChipping() && result) { - return result + m_inputOffset; - } else { - return NULL; - } - } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex; - if (isInnerChipping()) { - // m_stride is equal to 1, so let's avoid the integer division. - eigen_assert(m_stride == 1); - inputIndex = index * m_inputStride + m_inputOffset; - } else if (isOuterChipping()) { - // m_stride is always greater than index, so let's avoid the integer - // division. 
- eigen_assert(m_stride > index); - inputIndex = index + m_inputOffset; - } else { - const Index idx = index / m_stride; - inputIndex = idx * m_inputStride + m_inputOffset; - index -= idx * m_stride; - inputIndex += index; - } - return inputIndex; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const { - return IsInnerChipping || - (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == 0) || - (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const { - return IsOuterChipping || - (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) || - (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == 0); - } - - Dimensions m_dimensions; - Index m_stride; - Index m_inputOffset; - Index m_inputStride; - TensorEvaluator<ArgType, Device> m_impl; - const internal::DimensionId<DimId> m_dim; - const Device EIGEN_DEVICE_REF m_device; -}; - - -// Eval as lvalue -template<DenseIndex DimId, typename ArgType, typename Device> -struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> - : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> -{ - typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base; - typedef TensorChippingOp<DimId, ArgType> XprType; - static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - static const int NumDims = NumInputDims-1; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = TensorEvaluator<ArgType, 
Device>::RawAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - - if (this->isInnerChipping()) { - // m_stride is equal to 1, so let's avoid the integer division. - eigen_assert(this->m_stride == 1); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - Index inputIndex = index * this->m_inputStride + this->m_inputOffset; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - this->m_impl.coeffRef(inputIndex) = values[i]; - inputIndex += this->m_inputStride; - } - } else if (this->isOuterChipping()) { - // m_stride is always greater than index, so let's avoid the integer division. - eigen_assert(this->m_stride > index); - this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x); - } else { - const Index idx = index / this->m_stride; - const Index rem = index - idx * this->m_stride; - if (rem + PacketSize <= this->m_stride) { - const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; - this->m_impl.template writePacket<StoreMode>(inputIndex, x); - } else { - // Cross stride boundary. Fallback to slow path. 
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - this->coeffRef(index) = values[i]; - ++index; - } - } - } - } - - template <typename TensorBlock> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - assert(this->m_impl.data() != NULL); - - const Index chip_dim = this->m_dim.actualDim(); - - DSizes<Index, NumInputDims> input_block_dims; - for (int i = 0; i < NumInputDims; ++i) { - input_block_dims[i] = i < chip_dim ? desc.dimension(i) - : i > chip_dim ? desc.dimension(i - 1) - : 1; - } - - typedef TensorReshapingOp<const DSizes<Index, NumInputDims>, - const typename TensorBlock::XprType> - TensorBlockExpr; - - typedef internal::TensorBlockAssignment<Scalar, NumInputDims, - TensorBlockExpr, Index> - TensorBlockAssign; - - TensorBlockAssign::Run( - TensorBlockAssign::target( - input_block_dims, - internal::strides<Layout>(this->m_impl.dimensions()), - this->m_impl.data(), this->srcCoeff(desc.offset())), - block.expr().reshape(input_block_dims)); - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h deleted file mode 100644 index 5235a8e..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h +++ /dev/null @@ -1,377 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H - -namespace Eigen { - -/** \class TensorConcatenationOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor concatenation class. - * - * - */ -namespace internal { -template<typename Axis, typename LhsXprType, typename RhsXprType> -struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename promote_storage_type<typename LhsXprType::Scalar, - typename RhsXprType::Scalar>::ret Scalar; - typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, - typename traits<RhsXprType>::StorageKind>::ret StorageKind; - typedef typename promote_index_type<typename traits<LhsXprType>::Index, - typename traits<RhsXprType>::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference<LhsNested>::type _LhsNested; - typedef typename remove_reference<RhsNested>::type _RhsNested; - static const int NumDimensions = traits<LhsXprType>::NumDimensions; - static const int Layout = traits<LhsXprType>::Layout; - enum { Flags = 0 }; - typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, - typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType; -}; - -template<typename Axis, typename LhsXprType, typename RhsXprType> -struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense> -{ - typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type; -}; - -template<typename Axis, typename LhsXprType, typename RhsXprType> -struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type> -{ - typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type; -}; - -} // 
end namespace internal - - -template<typename Axis, typename LhsXprType, typename RhsXprType> -class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> -{ - public: - typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base; - typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar; - typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind; - typedef typename internal::traits<TensorConcatenationOp>::Index Index; - typedef typename internal::nested<TensorConcatenationOp>::type Nested; - typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, - typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; - typedef typename NumTraits<Scalar>::Real RealScalar; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename LhsXprType::Nested>::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename RhsXprType::Nested>::type& - rhsExpression() const { return m_rhs_xpr; } - - EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp) - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const Axis m_axis; -}; - - -// Eval as rvalue -template<typename Axis, typename LeftArgType, typename RightArgType, typename Device> -struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> -{ - typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; - static const 
int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess && - TensorEvaluator<RightArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess || - TensorEvaluator<RightArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - eigen_assert(0 <= m_axis && m_axis < NumDims); - const Dimensions& lhs_dims = m_leftImpl.dimensions(); - const Dimensions& rhs_dims = m_rightImpl.dimensions(); - { - int i = 0; - for (; i < m_axis; ++i) { - eigen_assert(lhs_dims[i] > 0); - eigen_assert(lhs_dims[i] == rhs_dims[i]); - m_dimensions[i] = lhs_dims[i]; - } - eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. 
- eigen_assert(rhs_dims[i] > 0); - m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; - for (++i; i < NumDims; ++i) { - eigen_assert(lhs_dims[i] > 0); - eigen_assert(lhs_dims[i] == rhs_dims[i]); - m_dimensions[i] = lhs_dims[i]; - } - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_leftStrides[0] = 1; - m_rightStrides[0] = 1; - m_outputStrides[0] = 1; - - for (int j = 1; j < NumDims; ++j) { - m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1]; - m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1]; - m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1]; - } - } else { - m_leftStrides[NumDims - 1] = 1; - m_rightStrides[NumDims - 1] = 1; - m_outputStrides[NumDims - 1] = 1; - - for (int j = NumDims - 2; j >= 0; --j) { - m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1]; - m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1]; - m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) - { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_STRONG_INLINE void cleanup() - { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. - // See CL/76180724 comments for more ideas. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - // Collect dimension-wise indices (subs). 
- array<Index, NumDims> subs; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - subs[i] = index / m_outputStrides[i]; - index -= subs[i] * m_outputStrides[i]; - } - subs[0] = index; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - subs[i] = index / m_outputStrides[i]; - index -= subs[i] * m_outputStrides[i]; - } - subs[NumDims - 1] = index; - } - - const Dimensions& left_dims = m_leftImpl.dimensions(); - if (subs[m_axis] < left_dims[m_axis]) { - Index left_index; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - left_index = subs[0]; - EIGEN_UNROLL_LOOP - for (int i = 1; i < NumDims; ++i) { - left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; - } - } else { - left_index = subs[NumDims - 1]; - EIGEN_UNROLL_LOOP - for (int i = NumDims - 2; i >= 0; --i) { - left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; - } - } - return m_leftImpl.coeff(left_index); - } else { - subs[m_axis] -= left_dims[m_axis]; - const Dimensions& right_dims = m_rightImpl.dimensions(); - Index right_index; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - right_index = subs[0]; - EIGEN_UNROLL_LOOP - for (int i = 1; i < NumDims; ++i) { - right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; - } - } else { - right_index = subs[NumDims - 1]; - EIGEN_UNROLL_LOOP - for (int i = NumDims - 2; i >= 0; --i) { - right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; - } - } - return m_rightImpl.coeff(right_index); - } - } - - // TODO(phli): Add a real vectorization. 
- template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = PacketType<CoeffReturnType, Device>::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < packetSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + - 2 * TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>() + - TensorOpCost::ModCost<Index>()); - const double lhs_size = m_leftImpl.dimensions().TotalSize(); - const double rhs_size = m_rightImpl.dimensions().TotalSize(); - return (lhs_size / (lhs_size + rhs_size)) * - m_leftImpl.costPerCoeff(vectorized) + - (rhs_size / (lhs_size + rhs_size)) * - m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - - #ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_leftImpl.bind(cgh); - m_rightImpl.bind(cgh); - } - #endif - - protected: - Dimensions m_dimensions; - array<Index, NumDims> m_outputStrides; - array<Index, NumDims> m_leftStrides; - array<Index, NumDims> m_rightStrides; - TensorEvaluator<LeftArgType, Device> m_leftImpl; - TensorEvaluator<RightArgType, Device> m_rightImpl; - const Axis m_axis; -}; - -// Eval as lvalue -template<typename Axis, typename LeftArgType, typename RightArgType, typename Device> - struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> - : public 
TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> -{ - typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base; - typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType; - typedef typename Base::Dimensions Dimensions; - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess && - TensorEvaluator<RightArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess || - TensorEvaluator<RightArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) - : Base(op, device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - // Collect dimension-wise indices (subs). 
- array<Index, Base::NumDims> subs; - for (int i = Base::NumDims - 1; i > 0; --i) { - subs[i] = index / this->m_outputStrides[i]; - index -= subs[i] * this->m_outputStrides[i]; - } - subs[0] = index; - - const Dimensions& left_dims = this->m_leftImpl.dimensions(); - if (subs[this->m_axis] < left_dims[this->m_axis]) { - Index left_index = subs[0]; - for (int i = 1; i < Base::NumDims; ++i) { - left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; - } - return this->m_leftImpl.coeffRef(left_index); - } else { - subs[this->m_axis] -= left_dims[this->m_axis]; - const Dimensions& right_dims = this->m_rightImpl.dimensions(); - Index right_index = subs[0]; - for (int i = 1; i < Base::NumDims; ++i) { - right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; - } - return this->m_rightImpl.coeffRef(right_index); - } - } - - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - const int packetSize = PacketType<CoeffReturnType, Device>::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); - - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - for (int i = 0; i < packetSize; ++i) { - coeffRef(index+i) = values[i]; - } - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h deleted file mode 100644 index 8b35f79..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h +++ /dev/null @@ -1,1023 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H - -namespace Eigen { - -/** \class TensorContraction - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor contraction class. - * - * - */ -namespace internal { - -template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType> -struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type, - typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar; - - typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, - typename traits<RhsXprType>::StorageKind>::ret StorageKind; - typedef typename promote_index_type<typename traits<LhsXprType>::Index, - typename traits<RhsXprType>::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference<LhsNested>::type _LhsNested; - typedef typename remove_reference<RhsNested>::type _RhsNested; - - // From NumDims below. 
- static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value; - static const int Layout = traits<LhsXprType>::Layout; - typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, - typename traits<LhsXprType>::PointerType, - typename traits<RhsXprType>::PointerType>::type - PointerType; - - enum { - Flags = 0 - }; -}; - -template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType> -struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense> -{ - typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type; -}; - -template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType> -struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >::type> -{ - typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type; -}; - -template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_, typename Device_> -struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_> > { - typedef Indices_ Indices; - typedef LeftArgType_ LeftArgType; - typedef RightArgType_ RightArgType; - typedef OutputKernelType_ OutputKernelType; - typedef Device_ Device; - - // From NumDims below. - static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value; -}; - -// Helper class to allocate and deallocate temporary memory for packed buffers. 
-template <typename LhsScalar, typename RhsScalar> -struct TensorContractionBlockMemAllocator { - typedef void* BlockMemHandle; - - template <typename Device> - EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, - const Index bk, - const Index bn, - LhsScalar** lhs_block, - RhsScalar** rhs_block) { - eigen_assert(lhs_block); - eigen_assert(rhs_block); - BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); - char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size)); - eigen_assert(block_mem); - *lhs_block = reinterpret_cast<LhsScalar*>(block_mem); - *rhs_block = reinterpret_cast<RhsScalar*>(block_mem + sz.lhs_size); - return block_mem; - } - - template <typename Device> - EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices( - Device& d, const Index bm, const Index bk, const Index bn, - const Index num_lhs, const Index num_rhs, const Index num_slices, - std::vector<LhsScalar*>* lhs_blocks, - std::vector<RhsScalar*>* rhs_blocks) { - eigen_assert(num_slices > 0); - eigen_assert(num_lhs >= 0 && num_rhs >= 0); - eigen_assert(num_lhs == 0 || lhs_blocks); - eigen_assert(num_rhs == 0 || rhs_blocks); - BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); - void* block_mem = d.allocate( - (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); - eigen_assert(block_mem); - char* mem = static_cast<char*>(block_mem); - - for (Index x = 0; x < num_slices; x++) { - if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); - for (Index m = 0; m < num_lhs; m++) { - lhs_blocks[x][m] = reinterpret_cast<LhsScalar*>(mem); - mem += sz.lhs_size; - } - if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); - for (Index n = 0; n < num_rhs; n++) { - rhs_blocks[x][n] = reinterpret_cast<RhsScalar*>(mem); - mem += sz.rhs_size; - } - } - - return block_mem; - } - - template <typename Device> - EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { - d.deallocate(handle); - } - - private: - struct BlockSizes { - Index lhs_size; - Index 
rhs_size; - }; - EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, - const Index bk, - const Index bn) { - Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - BlockSizes sz; - sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align; - sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align; - return sz; - } -}; - -// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in -// ColMajor storage order. This property is guaranteed by the -// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack -// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix -// multiplication for these blocks. Default tensor contraction uses -// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see -// GeneralBlocPanelKernel.h for details). -// -// By specializing contraction kernels we can use other low level libraries to -// perform matrix multiplication, and still rely on Eigen contraction evaluator. -// This also includes full support in TensorContractionThreadPool, assuming that -// underlying gemm do not use it's own threading. -// -// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of -// multiplication, lhs tensor and rhs tensor respectively. -// -// - StorageIndex - index type for the tensor expressions. In practice almost -// always is Eigen::Index. -// -// - OutputMapper provides access to the memory of the output matrix. In -// practice it's always column major blas_data_mapper (it must be of ResScalar -// type). -// -// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional -// view into the Lhs/Rhs tensor expressions. In practice it's -// TensorContractionInputMapper, or some specialization of it based on the -// type of tensor expression (e.g. TensorImagePatchOp has optimized input -// mapper). 
-template <typename ResScalar, typename LhsScalar, typename RhsScalar, - typename StorageIndex, typename OutputMapper, typename LhsMapper, - typename RhsMapper> -struct TensorContractionKernel { - // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C` - // (otherwise beta should be always equal to 1). - enum { HasBeta = false }; - - EIGEN_DEVICE_FUNC - TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, - StorageIndex bm_, StorageIndex bk_, StorageIndex bn_) - : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {} - - // Pack blocks of Lhs and Rhs into contiguous blocks in memory. - typedef LhsScalar* LhsBlock; - typedef RhsScalar* RhsBlock; - - // Packed Lhs/Rhs block memory allocator. - typedef TensorContractionBlockMemAllocator<LhsScalar, RhsScalar> - BlockMemAllocator; - typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; - - typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; - - typedef internal::gemm_pack_lhs< - LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr, - Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> - LhsPacker; - - typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex, - typename RhsMapper::SubMapper, Traits::nr, - ColMajor> - RhsPacker; - - typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex, - OutputMapper, Traits::mr, Traits::nr, - /*ConjugateLhs*/ false, /*ConjugateRhs*/ false> - GebpKernel; - - template <typename Device> - EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, - RhsBlock* rhs_block) { - return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); - } - - template <typename Device> - EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices( - Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, - const StorageIndex num_slices, std::vector<LhsBlock>* lhs_blocks, - std::vector<RhsBlock>* rhs_blocks) { - return BlockMemAllocator::allocateSlices( - d, bm, bk, bn, num_lhs, 
num_rhs, num_slices, lhs_blocks, rhs_blocks); - } - - template <typename Device> - EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { - BlockMemAllocator::deallocate(d, handle); - } - - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs( - LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex rows) { - LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, - /*offset*/ 0); - } - - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs( - RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex cols) { - RhsPacker()(*rhsBlock, data_mapper, depth, cols); - } - - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke( - const OutputMapper& output_mapper, const LhsBlock& lhsBlock, - const RhsBlock& rhsBlock, const StorageIndex rows, - const StorageIndex depth, const StorageIndex cols, - const ResScalar alpha, const ResScalar beta) { - // Default GEBP kernel does not support beta. - eigen_assert(beta == ResScalar(1)); - static const int kComputeStrideFromBlockDimensions = -1; - GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, - /*strideA*/ kComputeStrideFromBlockDimensions, - /*strideB*/ kComputeStrideFromBlockDimensions, - /*offsetA*/ 0, /*offsetB*/ 0); - } - - private: - // These are dimensions of the original Tensors, and selected block sizes. The - // actual block sizes passed to all function above might be smaller because of - // the partial blocks at the end. - const StorageIndex m; - const StorageIndex k; - const StorageIndex n; - const StorageIndex bm; - const StorageIndex bk; - const StorageIndex bn; -}; - -} // end namespace internal - -// Tensor contraction params that should enable to get from output matrix -// 2-dimensional coordinates to the output tensor dimensions. 
-struct TensorContractionParams { - // TensorContraction evaluator assumes that both tensors are in ColMajor - // layout, if tensors are in RowMajor evaluator swap lhs with rhs. - bool swapped_arguments; -}; - -// Output kernel allows to fuse operations into the tensor contraction. -// -// Examples: -// 1. Elementwise Relu transformation following Conv2D. -// 2. AddBias to the Conv2D output channels dimension. -// -// The NoOpOutputKernel implements an output kernel that does absolutely nothing. -struct NoOpOutputKernel { - /** - * Tensor contraction evaluator calls this kernel after finishing each block - * of output matrix. Output blocks belong to the 2-dimensional output tensor. - * - * TensorContractionParams contains contraction dimensions information - * required to map output 2-d space into the expected output tensor space - * (potentially higher dimensional). - * - * \param[in] output_mapper Access to output tensor memory - * \param[in] params Tensor contraction parameters - * \param[in] i Index of a first row available through output_mapper - * \param[in] j Index of a first column available through output_mapper - * \param[in] num_rows Number of available rows - * \param[in] num_cols Number of available columns - */ - template <typename Index, typename Scalar> - EIGEN_ALWAYS_INLINE void operator()( - const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper, - const TensorContractionParams& params, Index i, - Index j, Index num_rows, Index num_cols) const { - EIGEN_UNUSED_VARIABLE(output_mapper); - EIGEN_UNUSED_VARIABLE(params); - EIGEN_UNUSED_VARIABLE(i); - EIGEN_UNUSED_VARIABLE(j); - EIGEN_UNUSED_VARIABLE(num_rows); - EIGEN_UNUSED_VARIABLE(num_cols); - } -}; - -template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel> -class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors> -{ - public: - typedef 
typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar; - typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType, - typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType; - typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested; - typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( - const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims, - const OutputKernelType& output_kernel = OutputKernelType()) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), - m_output_kernel(output_kernel) {} - - EIGEN_DEVICE_FUNC - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename LhsXprType::Nested>::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename RhsXprType::Nested>::type& - rhsExpression() const { return m_rhs_xpr; } - - EIGEN_DEVICE_FUNC - const OutputKernelType& outputKernel() const { return m_output_kernel; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const Indices m_indices; - const OutputKernelType m_output_kernel; -}; - - -template<typename Derived> -struct TensorContractionEvaluatorBase : internal::no_assignment_operator -{ - typedef typename internal::traits<Derived>::Indices Indices; - typedef typename internal::traits<Derived>::LeftArgType LeftArgType; - typedef typename internal::traits<Derived>::RightArgType RightArgType; - typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType; - typedef typename internal::traits<Derived>::Device Device; - - typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType; - typedef typename 
internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef StorageMemory<Scalar, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = true, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = true - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
- typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType; - - static const int LDims = - internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; - static const int RDims = - internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; - static const int ContractDims = internal::array_size<Indices>::value; - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef array<Index, ContractDims> contract_t; - typedef array<Index, LDims - ContractDims> left_nocontract_t; - typedef array<Index, RDims - ContractDims> right_nocontract_t; - - typedef DSizes<Index, NumDims> Dimensions; - - EIGEN_STRONG_INLINE - TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), - op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device), - m_device(device), - m_output_kernel(op.outputKernel()), - m_result(NULL) { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == - static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - - DSizes<Index, LDims> eval_left_dims; - DSizes<Index, RDims> eval_right_dims; - array<IndexPair<Index>, ContractDims> eval_op_indices; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - // For ColMajor, we keep using the existing dimensions - for (int i = 0; i < LDims; i++) { - 
eval_left_dims[i] = m_leftImpl.dimensions()[i]; - } - for (int i = 0; i < RDims; i++) { - eval_right_dims[i] = m_rightImpl.dimensions()[i]; - } - // We keep the pairs of contracting indices. - for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = op.indices()[i].first; - eval_op_indices[i].second = op.indices()[i].second; - } - } else { - // For RowMajor, we need to reverse the existing dimensions - for (int i = 0; i < LDims; i++) { - eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; - } - for (int i = 0; i < RDims; i++) { - eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; - } - // We need to flip all the pairs of contracting indices as well as - // reversing the dimensions. - for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; - eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; - } - } - - // Check for duplicate axes and make sure the first index in eval_op_indices - // is increasing. 
Using O(n^2) sorting is OK since ContractDims is small - for (int i = 0; i < ContractDims; i++) { - for (int j = i + 1; j < ContractDims; j++) { - eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && - eval_op_indices[j].second != eval_op_indices[i].second && - "contraction axes should be unique"); - if (eval_op_indices[j].first < eval_op_indices[i].first) { - numext::swap(eval_op_indices[j], eval_op_indices[i]); - } - } - } - - array<Index, LDims> lhs_strides; - lhs_strides[0] = 1; - for (int i = 0; i < LDims-1; ++i) { - lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; - } - - array<Index, RDims> rhs_strides; - rhs_strides[0] = 1; - for (int i = 0; i < RDims-1; ++i) { - rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; - } - - if (m_i_strides.size() > 0) m_i_strides[0] = 1; - if (m_j_strides.size() > 0) m_j_strides[0] = 1; - if (m_k_strides.size() > 0) m_k_strides[0] = 1; - - m_i_size = 1; - m_j_size = 1; - m_k_size = 1; - - // To compute the dimension, we simply concatenate the non-contracting - // dimensions of the left and then the right tensor. Additionally, we also - // compute the strides corresponding to the left non-contracting - // dimensions and right non-contracting dimensions. 
- m_lhs_inner_dim_contiguous = true; - int dim_idx = 0; - Index nocontract_idx = 0; - - for (int i = 0; i < LDims; i++) { - // find if we are contracting on index i of left tensor - bool contracting = false; - for (int j = 0; j < ContractDims; j++) { - if (eval_op_indices[j].first == i) { - contracting = true; - break; - } - } - if (!contracting) { - // add dimension size to output dimensions - m_dimensions[dim_idx] = eval_left_dims[i]; - m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; - if (dim_idx != i) { - m_lhs_inner_dim_contiguous = false; - } - if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) { - m_i_strides[nocontract_idx+1] = - m_i_strides[nocontract_idx] * eval_left_dims[i]; - } else { - m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; - } - dim_idx++; - nocontract_idx++; - } - } - - nocontract_idx = 0; - for (int i = 0; i < RDims; i++) { - bool contracting = false; - // find if we are contracting on index i of right tensor - for (int j = 0; j < ContractDims; j++) { - if (eval_op_indices[j].second == i) { - contracting = true; - break; - } - } - if (!contracting) { - m_dimensions[dim_idx] = eval_right_dims[i]; - if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) { - m_j_strides[nocontract_idx+1] = - m_j_strides[nocontract_idx] * eval_right_dims[i]; - } else { - m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; - } - m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; - dim_idx++; - nocontract_idx++; - } - } - - // Now compute the strides corresponding to the contracting dimensions. We - // assumed above that non-contracting axes are represented in the same order - // in the matrix as they are in the tensor. This is not the case for - // contracting axes. As the contracting axes must be of the same size in - // each tensor, we'll only look at the first tensor here. 
- m_rhs_inner_dim_contiguous = true; - m_rhs_inner_dim_reordered = false; - for (int i = 0; i < ContractDims; i++) { - Index left = eval_op_indices[i].first; - Index right = eval_op_indices[i].second; - - Index size = eval_left_dims[left]; - eigen_assert(size == eval_right_dims[right] && - "Contraction axes must be same size"); - - if (i+1 < static_cast<int>(internal::array_size<contract_t>::value)) { - m_k_strides[i+1] = m_k_strides[i] * size; - } else { - m_k_size = m_k_strides[i] * size; - } - m_left_contracting_strides[i] = lhs_strides[left]; - m_right_contracting_strides[i] = rhs_strides[right]; - - if (i > 0 && right < eval_op_indices[i-1].second) { - m_rhs_inner_dim_reordered = true; - } - if (right != i) { - m_rhs_inner_dim_contiguous = false; - } - } - - // If the layout is RowMajor, we need to reverse the m_dimensions - if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { - for (int i = 0, j = NumDims - 1; i < j; i++, j--) { - numext::swap(m_dimensions[i], m_dimensions[j]); - } - } - - // A set of parameters that will allow output kernel to get from output - // tensor dimensions (i, j) into the original tensor dimensions. - // TODO(ezhulenev): Add parameters required to infer output tensor index for - // more complex contractions than 2x2 on internal dimension. 
- m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalTo(m_result); - return true; - } - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType dest, EvalSubExprsCallback done) { - m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { - m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { - if (dest) { - evalToAsync(dest, [done]() { done(false); }); - } else { - m_result = static_cast<EvaluatorPointerType>( - m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalToAsync(m_result, [done]() { done(true); }); - } - }); - }); - } -#endif // EIGEN_USE_THREADS - -#ifndef TENSOR_CONTRACTION_DISPATCH -#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ - if (this->m_lhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD<true, true, true, ALIGNMENT> ARGS; \ - } else { \ - METHOD<true, true, false, ALIGNMENT> ARGS; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD<true, false, true, ALIGNMENT> ARGS; \ - } else { \ - METHOD<true, false, false, ALIGNMENT> ARGS; \ - } \ - } \ - } else { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD<false, true, true, ALIGNMENT> ARGS; \ - } else { \ - METHOD<false, true, false, ALIGNMENT> ARGS; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD<false, 
false, true, ALIGNMENT> ARGS; \ - } else { \ - METHOD<false, false, false, ALIGNMENT> ARGS; \ - } \ - } \ - } -#endif - -#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH -#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ - if (this->m_lhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD<DONE, true, true, true, ALIGNMENT> ARGS)->FN; \ - } else { \ - (new METHOD<DONE, true, true, false, ALIGNMENT> ARGS)->FN; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD<DONE, true, false, true, ALIGNMENT> ARGS)->FN; \ - } else { \ - (new METHOD<DONE, true, false, false, ALIGNMENT> ARGS)->FN; \ - } \ - } \ - } else { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD<DONE, false, true, true, ALIGNMENT> ARGS)->FN; \ - } else { \ - (new METHOD<DONE, false, true, false, ALIGNMENT> ARGS)->FN; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD<DONE, false, false, true, ALIGNMENT> ARGS)->FN; \ - } else { \ - (new METHOD<DONE, false, false, false, ALIGNMENT> ARGS)->FN; \ - } \ - } \ - } -#endif - - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { - static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer); - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalToCallback> - void evalToAsync(Scalar* buffer, EvalToCallback done) const { - static_cast<const Derived*>(this) - ->template evalProductAsync<EvalToCallback, Unaligned>(buffer, - std::move(done)); - } -#endif // EIGEN_USE_THREADS - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, - bool rhs_inner_dim_reordered, int Alignment> - void evalProductSequential(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv<lhs_inner_dim_contiguous, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, - Alignment>(buffer); - } else { - this->template 
evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Alignment>(buffer); - } - } - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - void evalGemv(Scalar* buffer) const { - const Index rows = m_i_size; - const Index cols = m_k_size; - - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size; - const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size; - const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; - const int rhs_alignment = RightEvaluator::IsAligned ? 
Aligned : Unaligned; - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - LeftEvaluator, left_nocontract_t, - contract_t, lhs_packet_size, - lhs_inner_dim_contiguous, - false, lhs_alignment> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - RightEvaluator, right_nocontract_t, - contract_t, rhs_packet_size, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, rhs_alignment> RhsMapper; - - LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, - m_left_contracting_strides, m_k_strides); - RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, - m_right_contracting_strides, m_k_strides); - - const Scalar alpha(1); - const Index resIncr(1); - - // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) - m_device.memset(buffer, 0, rows * sizeof(Scalar)); - - internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run( - rows, cols, lhs, rhs, - buffer, resIncr, alpha); - - typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; - m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, - static_cast<Index>(0), static_cast<Index>(0), rows, - static_cast<Index>(1)); - } - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - this->template evalGemmPartial<lhs_inner_dim_contiguous, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, - Alignment, true>(buffer, 0, k, 1); - } - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, - bool rhs_inner_dim_reordered, int Alignment> - EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel( - Scalar* buffer, Index k_start, Index k_end, int num_threads) 
const { - evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Alignment, - /*use_output_kernel*/ false>(buffer, k_start, k_end, - num_threads); - } - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel> - EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const { - eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size); - // columns in slice on left side, rows on right side - const Index k_slice = k_end - k_start; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // define data mappers for Lhs and Rhs - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size; - const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size; - - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - LeftEvaluator, left_nocontract_t, - contract_t, lhs_packet_size, - lhs_inner_dim_contiguous, - false, Unaligned> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - RightEvaluator, right_nocontract_t, - contract_t, rhs_packet_size, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned> RhsMapper; - - typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; - - typedef internal::TensorContractionKernel< - Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> - TensorContractionKernel; - - // 
initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, - Index, internal::ShardByCol> - blocking(k_slice, m, n, num_threads); - const Index kc = blocking.kc(); - const Index mc = numext::mini(m, blocking.mc()); - const Index nc = numext::mini(n, blocking.nc()); - - typedef typename TensorContractionKernel::LhsBlock LhsBlock; - typedef typename TensorContractionKernel::RhsBlock RhsBlock; - - LhsBlock blockA; - RhsBlock blockB; - - TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); - - typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; - const BlockMemHandle packed_mem = - kernel.allocate(this->m_device, &blockA, &blockB); - - // If a contraction kernel does not support beta, explicitly initialize - // output buffer with zeroes. - if (!TensorContractionKernel::HasBeta) { - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - } - - for(Index i2=0; i2<m; i2+=mc) - { - const Index actual_mc = numext::mini(i2+mc,m)-i2; - for (Index k2 = k_start; k2 < k_end; k2 += kc) { - // make sure we don't overshoot right edge of left matrix, then pack vertical panel - const Index actual_kc = numext::mini(k2 + kc, k_end) - k2; - kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - - // If kernel supports beta, there is no need to initialize output - // buffer with zeroes. - const Scalar alpha = Scalar(1); - const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start) - ? 
Scalar(0) - : Scalar(1); - - // series of horizontal blocks - for (Index j2 = 0; j2 < n; j2 += nc) { - // make sure we don't overshoot right edge of right matrix, then pack block - const Index actual_nc = numext::mini(j2 + nc, n) - j2; - kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc, - actual_nc); - - // call gebp (matrix kernel) - // The parameters here are copied from Eigen's GEMM implementation - const OutputMapper output_mapper = output.getSubMapper(i2, j2); - kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc, - actual_nc, alpha, beta); - - // We are done with this [i2, j2] output block. - if (use_output_kernel && k2 + kc >= k_end) { - m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, - actual_mc, actual_nc); - } - } - } - } - - kernel.deallocate(this->m_device, packed_mem); - } - - EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - - if (m_result != NULL) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } - -protected: - Dimensions m_dimensions; - - contract_t m_k_strides; - contract_t m_left_contracting_strides; - contract_t m_right_contracting_strides; - - bool m_lhs_inner_dim_contiguous; - bool m_rhs_inner_dim_contiguous; - bool m_rhs_inner_dim_reordered; - - left_nocontract_t m_i_strides; - right_nocontract_t m_j_strides; - left_nocontract_t m_left_nocontract_strides; - right_nocontract_t m_right_nocontract_strides; - - Index 
m_i_size; - Index m_j_size; - Index m_k_size; - - TensorContractionParams m_tensor_contraction_params; - - TensorEvaluator<EvalLeftArgType, Device> m_leftImpl; - TensorEvaluator<EvalRightArgType, Device> m_rightImpl; - const Device EIGEN_DEVICE_REF m_device; - OutputKernelType m_output_kernel; - EvaluatorPointerType m_result; -}; - - -// evaluator for default device -template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device> -struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> : - public TensorContractionEvaluatorBase< - TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> > { - typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self; - typedef TensorContractionEvaluatorBase<Self> Base; - - typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - - enum { - Layout = TensorEvaluator<LeftArgType, Device>::Layout - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
- typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; - static const int RDims = - internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; - static const int ContractDims = internal::array_size<Indices>::value; - - typedef array<Index, ContractDims> contract_t; - typedef array<Index, LDims - ContractDims> left_nocontract_t; - typedef array<Index, RDims - ContractDims> right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - // Could we use NumDimensions here? - typedef DSizes<Index, NumDims> Dimensions; - - TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) { } - - template <int Alignment> - void evalProduct(Scalar* buffer) const { - TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h deleted file mode 100644 index 974feb0..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h +++ /dev/null @@ -1,73 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H - - -namespace Eigen { -namespace internal { - -enum { - ShardByRow = 0, - ShardByCol = 1 -}; - - -// Default Blocking Strategy -template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol> -class TensorContractionBlocking { - public: - - /* - adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` - requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` - which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h` - which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` - (else HIPCC will error out) - - However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` - results in NVCC erroring out with the following error - - ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: - dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function - */ - - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) : - kc_(k), mc_(m), nc_(n) - { - if (ShardingType == ShardByCol) { - computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads); - } - else { - computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads); - } - - const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; - kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? 
- kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } - - private: - StorageIndex kc_; - StorageIndex mc_; - StorageIndex nc_; -}; - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h deleted file mode 100644 index 3f315fe..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h +++ /dev/null @@ -1,6 +0,0 @@ - -#if defined(__clang__) || defined(__GNUC__) -#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file" -#endif - -#include "TensorContractionGpu.h" diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h deleted file mode 100644 index c818038..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h +++ /dev/null @@ -1,1413 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com> -// Copyright (C) 2014 Eric Martin <eric@ericmart.in> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H - -#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) - -namespace Eigen { - -template<typename Scalar, typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper, bool needs_edge_check> -__device__ EIGEN_STRONG_INLINE void -EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, - const Index m_size, const Index n_size, const Index k_size) { - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - // declare and initialize 64 registers for output 8x8 block - - // prefetch registers - Scalar lhs_pf0; - Scalar lhs_pf1; - Scalar lhs_pf2; - Scalar lhs_pf3; - Scalar lhs_pf4; - Scalar lhs_pf5; - Scalar lhs_pf6; - Scalar lhs_pf7; - - Scalar rhs_pf0; - Scalar rhs_pf1; - Scalar rhs_pf2; - Scalar rhs_pf3; - Scalar rhs_pf4; - Scalar rhs_pf5; - Scalar rhs_pf6; - Scalar rhs_pf7; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in threadIdx.x. - - // On the LHS, we pad each row inside of each block with an extra element. This makes - // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts - // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. - - // On the RHS we just add 8 padding elements to the end of each block. This gives no bank - // conflicts on writes and also none on reads. 
- - // storage indices - const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; - const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; - - const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; - const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; - const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; - const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; - const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; - const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; - const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; - - const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; - const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; - const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; - const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; - const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; - const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; - const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; - const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; - - // in the loading code, the following variables are important: - // threadIdx.x: the vertical position in an 8x8 block - // threadIdx.y: the vertical index of the 8x8 block in the grid - // threadIdx.z: the horizontal position in an 8x8 block - // k: the horizontal index of the 8x8 block in the grid - // - // The k parameter is implicit (it was the loop counter for a loop that went - // from 0 to <8, but now that loop is unrolled in the below code. 
- - const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; - const Index lhs_vert = base_m + load_idx_vert; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = conv(0); \ - lhs_pf1 = conv(0); \ - lhs_pf2 = conv(0); \ - lhs_pf3 = conv(0); \ - lhs_pf4 = conv(0); \ - lhs_pf5 = conv(0); \ - lhs_pf6 = conv(0); \ - lhs_pf7 = conv(0); \ - \ - rhs_pf0 = conv(0); \ - rhs_pf1 = conv(0); \ - rhs_pf2 = conv(0); \ - rhs_pf3 = conv(0); \ - rhs_pf4 = conv(0); \ - rhs_pf5 = conv(0); \ - rhs_pf6 = conv(0); \ - rhs_pf7 = conv(0); \ - \ - if (!needs_edge_check || lhs_vert < m_size) { \ - const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ - const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ - const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ - const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ - const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ - const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ - const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ - const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ - \ - if (!needs_edge_check || lhs_horiz_7 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ - } else if (lhs_horiz_6 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - } else if (lhs_horiz_5 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ 
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - } else if (lhs_horiz_4 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - } else if (lhs_horiz_3 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - } else if (lhs_horiz_2 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - } else if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - } \ - } \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (!needs_edge_check || rhs_vert < k_size) { \ - const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ - const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ - const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ - const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ - const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ - const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ - const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ - const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ - \ - if (rhs_horiz_7 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ - } else if (rhs_horiz_6 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); 
\ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - } else if (rhs_horiz_5 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - } else if (rhs_horiz_4 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - } else if (rhs_horiz_3 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - } else if (rhs_horiz_2 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - } else if (rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - } \ - } \ - } \ - -#define writeRegToShmem(_) \ - lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ - rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ - \ - lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ - rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ - \ - lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ - rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ - \ - lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ - rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ - \ - lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ - rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ - \ - lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ - rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ - \ - lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ - rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ - \ - 
lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ - rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ - - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = conv(0); \ - Scalar res(i, 1) = conv(0); \ - Scalar res(i, 2) = conv(0); \ - Scalar res(i, 3) = conv(0); \ - Scalar res(i, 4) = conv(0); \ - Scalar res(i, 5) = conv(0); \ - Scalar res(i, 6) = conv(0); \ - Scalar res(i, 7) = conv(0); \ - - internal::scalar_cast_op<int, Scalar> conv; - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. Despite common sense, - // the code is a bit faster with this here then at bottom of loop - __syncthreads(); - - prefetchIntoRegisters(base_k); - writeRegToShmem(); - - #undef prefetchIntoRegisters - #undef writeRegToShmem - - // wait for shared mem packing to be done before starting computation - __syncthreads(); - - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). 
- -#define lcol(i) _lcol##i - Scalar lcol(0); - Scalar lcol(1); - Scalar lcol(2); - Scalar lcol(3); - Scalar lcol(4); - Scalar lcol(5); - Scalar lcol(6); - Scalar lcol(7); - -#define rrow(j) _rrow##j - Scalar rrow(0); - Scalar rrow(1); - Scalar rrow(2); - Scalar rrow(3); - Scalar rrow(4); - Scalar rrow(5); - Scalar rrow(6); - Scalar rrow(7); - - // Now x corresponds to k, y to m, and z to n - const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; - const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; - -#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] -#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] - -#define loadData(i, j) \ - lcol(0) = lhs_element(0, j); \ - rrow(0) = rhs_element(i, 0); \ - lcol(1) = lhs_element(1, j); \ - rrow(1) = rhs_element(i, 1); \ - lcol(2) = lhs_element(2, j); \ - rrow(2) = rhs_element(i, 2); \ - lcol(3) = lhs_element(3, j); \ - rrow(3) = rhs_element(i, 3); \ - lcol(4) = lhs_element(4, j); \ - rrow(4) = rhs_element(i, 4); \ - lcol(5) = lhs_element(5, j); \ - rrow(5) = rhs_element(i, 5); \ - lcol(6) = lhs_element(6, j); \ - rrow(6) = rhs_element(i, 6); \ - lcol(7) = lhs_element(7, j); \ - rrow(7) = rhs_element(i, 7); \ - -#define computeCol(j) \ - res(0, j) += lcol(0) * rrow(j); \ - res(1, j) += lcol(1) * rrow(j); \ - res(2, j) += lcol(2) * rrow(j); \ - res(3, j) += lcol(3) * rrow(j); \ - res(4, j) += lcol(4) * rrow(j); \ - res(5, j) += lcol(5) * rrow(j); \ - res(6, j) += lcol(6) * rrow(j); \ - res(7, j) += lcol(7) * rrow(j); \ - -#define computePass(i) \ - loadData(i, i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol -#undef rrow -#undef lhs_element -#undef rhs_element -#undef loadData -#undef computeCol -#undef 
computePass - } // end loop over k - - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. -#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) -#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) -#else -#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) -#endif - -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ - -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); - -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix - - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. 
- // (2) is slightly faster than (1) due to less branching and more ILP - - // TODO: won't yield much gain, but could just use currently unused shared mem - // and then we won't have to sync - // wait for shared mem to be out of use - __syncthreads(); - -#define writeResultShmem(i, j) \ - lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ - -#define writeRow(i) \ - writeResultShmem(i, 0); \ - writeResultShmem(i, 1); \ - writeResultShmem(i, 2); \ - writeResultShmem(i, 3); \ - writeResultShmem(i, 4); \ - writeResultShmem(i, 5); \ - writeResultShmem(i, 6); \ - writeResultShmem(i, 7); \ - - if (threadIdx.x == 0) { - writeRow(0); - writeRow(1); - writeRow(2); - writeRow(3); - writeRow(4); - writeRow(5); - writeRow(6); - writeRow(7); - } -#undef writeResultShmem -#undef writeRow - - const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); - const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); - - if (threadIdx.x < max_i_write) { - if (max_j_write == 8) { - // TODO: can i trade bank conflicts for coalesced writes? 
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; - Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; - Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; - Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; - Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; - Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; - Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; - Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; - - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; - } else { -#pragma unroll 7 - for (int j = 0; j < max_j_write; j++) { - Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; - } - } - } -#undef res -} - - -template<typename Scalar, typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper> -__global__ void -#if defined(EIGEN_HIPCC) -__launch_bounds__(512, 1) -#else -__launch_bounds__(512) -#endif -EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index 
m_size, const Index n_size, const Index k_size) { - __shared__ Scalar lhs_shmem[72 * 64]; - __shared__ Scalar rhs_shmem[72 * 64]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } -} - - -template<typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, - bool CHECK_RHS_BOUNDARY> -__device__ __forceinline__ void -EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][16], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - - // prefetch registers - float4 lhs_pf0, rhs_pf0; - - float4 results[4]; - for (int i=0; i < 4; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.template loadPacket<float4,Unaligned>(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.template loadPacket<float4,Unaligned>(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ - - Index lhs_vert = base_m+threadIdx.x*4; - - for (Index k = 0; k < k_size; k += 16) { - - lhs_pf0 = 
internal::pset1<float4>(0); - rhs_pf0 = internal::pset1<float4>(0); - - Index lhs_horiz = threadIdx.y+k; - prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) - - Index rhs_vert = k+(threadIdx.x%4)*4; - Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; - - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } else { - if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - float x1, x2 ; - // the following can be a bitwise operation..... some day. - if((threadIdx.x%8) < 4) { - x1 = rhs_pf0.y; - x2 = rhs_pf0.w; - } else { - x1 = rhs_pf0.x; - x2 = rhs_pf0.z; - } - #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) - x1 = __shfl_xor(x1, 4); - x2 = __shfl_xor(x2, 4); - #else - x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); - x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); - #endif - if((threadIdx.x%8) < 4) { - rhs_pf0.y = x1; - rhs_pf0.w = x2; - } else { - rhs_pf0.x = x1; - rhs_pf0.z = x2; - } - - // We have 64 features. 
- // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. - // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. - // ... - // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 - // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 - // ... - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); - - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // ... - // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) - // ... - - lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); - - -#define add_vals(fl1, fl2, fr1, fr2)\ - results[0].x += fl1.x * fr1.x;\ - results[0].y += fl1.y * fr1.x;\ - results[0].z += fl2.x * fr1.x;\ - results[0].w += fl2.y * fr1.x;\ -\ - results[1].x += fl1.x * fr1.y;\ - results[1].y += fl1.y * fr1.y;\ - results[1].z += fl2.x * fr1.y;\ - results[1].w += fl2.y * fr1.y;\ -\ - results[2].x += fl1.x * fr2.x;\ - results[2].y += fl1.y * fr2.x;\ - results[2].z += fl2.x * fr2.x;\ - results[2].w += fl2.y * fr2.x;\ -\ - results[3].x += fl1.x * fr2.y;\ - results[3].y += fl1.y * fr2.y;\ - results[3].z += fl2.x * fr2.y;\ - results[3].w += fl2.y * fr2.y;\ - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 16; koff ++) { - // 32 x threads. 
- float2 fl1 = lhs_shmem2[koff][threadIdx.x]; - float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; - - int start_feature = threadIdx.y * 4; - float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - - add_vals(fl1, fl2, fr1, fr2) - } - __syncthreads(); - } - -#undef prefetch_lhs -#undef add_vals - - Index horiz_base = threadIdx.y*4+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - // CHECK LHS - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK RHS - /* - int ncols_rem = fminf(n_size- horiz_base, 4); - for (int i = 0; i < ncols_rem; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - 
}*/ - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template<typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, - bool CHECK_RHS_BOUNDARY> -__device__ __forceinline__ void -EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][32], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - - // prefetch registers - float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; - float4 rhs_pf0, rhs_pf1; - - float4 results[8]; - for (int i=0; i < 8; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; - for (Index k = 0; k < k_size; k += 32) { - lhs_pf0 = internal::pset1<float4>(0); - lhs_pf1 = internal::pset1<float4>(0); - lhs_pf2 = internal::pset1<float4>(0); - lhs_pf3 = internal::pset1<float4>(0); - - rhs_pf0 = internal::pset1<float4>(0); - rhs_pf1 = internal::pset1<float4>(0); - - if (!CHECK_LHS_BOUNDARY) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 
=lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k)); - } - } else { - // just CHECK_LHS_BOUNDARY - if (lhs_vert + 3 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 2 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x 
=lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 1 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x 
=lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - } - } else if (lhs_vert < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - } - } - } - __syncthreads(); - Index rhs_vert = k+threadIdx.x*4; - Index rhs_horiz0 = threadIdx.y*2+base_n; - Index rhs_horiz1 = threadIdx.y*2+1+base_n; - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template 
loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else { - if (rhs_horiz1 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (k+threadIdx.x*4 + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (k+threadIdx.x*4 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, 
rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - __syncthreads(); - // Loaded. Do computation - // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. - // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. - // .. - // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 - rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); - // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. - // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. - // .. - rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); - // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. - // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. - rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); - // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. - // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. - rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); - - // LHS. - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // ... - // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) - - -#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ - results[0].x += a_feat1.x * f1.x;\ - results[1].x += a_feat1.x * f1.y;\ - results[2].x += a_feat1.x * f2.x;\ - results[3].x += a_feat1.x * f2.y;\ - results[4].x += a_feat1.x * f3.x;\ - results[5].x += a_feat1.x * f3.y;\ - results[6].x += a_feat1.x * f4.x;\ - results[7].x += a_feat1.x * f4.y;\ -\ - results[0].y += a_feat1.y * f1.x;\ - results[1].y += a_feat1.y * f1.y;\ - results[2].y += a_feat1.y * f2.x;\ - results[3].y += a_feat1.y * f2.y;\ - results[4].y += a_feat1.y * f3.x;\ - results[5].y += a_feat1.y * f3.y;\ - results[6].y += a_feat1.y * f4.x;\ - results[7].y += a_feat1.y * f4.y;\ -\ - results[0].z += a_feat2.x * f1.x;\ - results[1].z += a_feat2.x * f1.y;\ - results[2].z += a_feat2.x * f2.x;\ - results[3].z += a_feat2.x * f2.y;\ - results[4].z += a_feat2.x * f3.x;\ - results[5].z += a_feat2.x * f3.y;\ - results[6].z += a_feat2.x * f4.x;\ - results[7].z += a_feat2.x * f4.y;\ -\ - results[0].w += a_feat2.y * f1.x;\ - results[1].w += a_feat2.y * f1.y;\ - results[2].w += a_feat2.y * f2.x;\ - results[3].w += a_feat2.y * f2.y;\ - results[4].w += a_feat2.y * f3.x;\ - results[5].w += a_feat2.y * f3.y;\ - results[6].w += a_feat2.y * f4.x;\ - results[7].w += a_feat2.y * f4.y;\ - - lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); - lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); - lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); - - lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); - lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); - lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); - lhs_shmem2[threadIdx.y/4 + 
56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 32; koff ++) { - float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; - float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; - - // first feature is at (threadIdx.y/4) * 8 last is at start + 8. - int start_feature = (threadIdx.y / 4) * 8; - - float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; - float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; - float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; - float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - - add_vals(a3, a4, br1, br2, br3, br4) - } - __syncthreads(); - } // end loop over k - - __syncthreads(); - Index horiz_base = (threadIdx.y/4)*8+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 8; i++) { - 
output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK BOUNDARY_B - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template<typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper> -__global__ void -#if defined(EIGEN_HIPCC) -__launch_bounds__(256, 1) -#else -__launch_bounds__(256) -#endif -EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[64*32]; - __shared__ float2 rhs_shmem[128*8]; - - typedef float2 LHS_MEM[64][32]; - typedef float2 RHS_MEM[128][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 128 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - bool check_rhs = (base_n + 63) >= n_size; - bool check_lhs128 = (base_m + 127) >= m_size; - - if (!check_rhs) { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, 
false>( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } else { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } -} - -template<typename Index, typename LhsMapper, - typename RhsMapper, typename OutputMapper> -__global__ void -#if defined(EIGEN_HIPCC) -__launch_bounds__(256, 1) -#else -__launch_bounds__(256) -#endif -EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[32][16]; - __shared__ float2 rhs_shmem[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size) { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } else { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, 
true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } -} - - -template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType> -struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> : - public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > { - - typedef GpuDevice Device; - - typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self; - typedef TensorContractionEvaluatorBase<Self> Base; - - typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; - - enum { - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
- typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; - static const int RDims = - internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; - static const int ContractDims = internal::array_size<Indices>::value; - - typedef array<Index, LDims> left_dim_mapper_t; - typedef array<Index, RDims> right_dim_mapper_t; - - typedef array<Index, ContractDims> contract_t; - typedef array<Index, LDims - ContractDims> left_nocontract_t; - typedef array<Index, RDims - ContractDims> right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes<Index, NumDims> Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) - { - EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value), - GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); - } - - // We need to redefine this method to make nvcc happy - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - 
} else { - this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<true, true, true, Unaligned>(buffer); - } - else { - evalTyped<true, true, false, Unaligned>(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<true, false, true, Unaligned>(buffer); - } - else { - evalTyped<true, false, false, Unaligned>(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<false, true, true, Unaligned>(buffer); - } - else { - evalTyped<false, true, false, Unaligned>(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<false, false, true, Unaligned>(buffer); - } - else { - evalTyped<false, false, false, Unaligned>(buffer); - } - } - } - } - - template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - }; - - template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - if (m < 
768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - } - }; - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - LeftEvaluator, left_nocontract_t, - contract_t, 4, - lhs_inner_dim_contiguous, - false, Unaligned> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - RightEvaluator, right_nocontract_t, - contract_t, 4, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned> RhsMapper; - - typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; - - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - 
this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - -#if defined(EIGEN_USE_HIP) - setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); -#else - setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); -#endif - - LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_GPU and EIGEN_GPUCC -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h deleted file mode 100644 index 9ab900b..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h +++ /dev/null @@ -1,575 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H - -namespace Eigen { - -namespace internal { - -enum { - Rhs = 0, - Lhs = 1 -}; - -/* - * Implementation of the Eigen blas_data_mapper class for tensors. - */ -/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which -/// is scalar * for CoeffLoader. 
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer> -struct CoeffLoader; - -template <typename Scalar, typename Index, int side, typename Tensor, - typename nocontract_t, typename contract_t, int packet_size, - bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, - template <class> class MakePointer_ = MakePointer> -class BaseTensorContractionMapper; - -template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_> -struct CoeffLoader { - enum { - DirectOffsets = false - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { - eigen_assert(false && "unsupported"); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type - data() const { - eigen_assert(false && "unsupported"); - return NULL; - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } - - template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename Tensor::PacketReturnType packet(typename Tensor::Index index) const - { - return m_tensor.template packet<LoadMode>(index); - } - - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_tensor.bind(cgh); - } - #endif - - private: - const Tensor m_tensor; -}; - -template <typename Tensor, template <class> class MakePointer_> -struct CoeffLoader<Tensor, true, MakePointer_> { - enum { - DirectOffsets = true - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { - m_data += offset; - } - - EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type - data() const { - return m_data; - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } - - template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename Tensor::PacketReturnType packet(typename Tensor::Index index) const - { - return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index); - } - - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } - #endif - private: - typedef typename Tensor::Scalar Scalar; - - typename MakePointer_<const Scalar>::Type m_data; -}; - -template<typename Scalar, typename Index, int side, - typename Tensor, - typename nocontract_t, typename contract_t, - int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer> -class SimpleTensorContractionMapper { - public: - EIGEN_DEVICE_FUNC - SimpleTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - m_tensor(tensor), - m_nocontract_strides(nocontract_strides), - m_ij_strides(ij_strides), - m_contract_strides(contract_strides), - m_k_strides(k_strides) { } - - enum { - DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { - m_tensor.offsetBuffer(offset); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row) const { - // column major assumption - return operator()(row, 0); - } - - EIGEN_DEVICE_FUNC - 
EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { - return m_tensor.coeff(computeIndex(row, col)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { - const bool left = (side == Lhs); - EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 - Index nocontract_val = left ? row : col; - Index linidx = 0; - EIGEN_UNROLL_LOOP - for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { - const Index idx = nocontract_val / m_ij_strides[i]; - linidx += idx * m_nocontract_strides[i]; - nocontract_val -= idx * m_ij_strides[i]; - } - if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx += nocontract_val; - } else { - linidx += nocontract_val * m_nocontract_strides[0]; - } - } - - Index contract_val = left ? col : row; - if(array_size<contract_t>::value > 0) { - EIGEN_UNROLL_LOOP - for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { - const Index idx = contract_val / m_k_strides[i]; - linidx += idx * m_contract_strides[i]; - contract_val -= idx * m_k_strides[i]; - } - - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx += contract_val; - } else { - linidx += contract_val * m_contract_strides[0]; - } - } - - return linidx; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const { - const bool left = (side == Lhs); - EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 - Index nocontract_val[2] = {left ? row : col, left ? 
row + distance : col}; - Index linidx[2] = {0, 0}; - if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { - EIGEN_UNROLL_LOOP - for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { - const Index idx0 = nocontract_val[0] / m_ij_strides[i]; - const Index idx1 = nocontract_val[1] / m_ij_strides[i]; - linidx[0] += idx0 * m_nocontract_strides[i]; - linidx[1] += idx1 * m_nocontract_strides[i]; - nocontract_val[0] -= idx0 * m_ij_strides[i]; - nocontract_val[1] -= idx1 * m_ij_strides[i]; - } - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx[0] += nocontract_val[0]; - linidx[1] += nocontract_val[1]; - } else { - linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; - linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; - } - } - - Index contract_val[2] = {left ? col : row, left ? col : row + distance}; - if (array_size<contract_t>::value> 0) { - EIGEN_UNROLL_LOOP - for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { - const Index idx0 = contract_val[0] / m_k_strides[i]; - const Index idx1 = contract_val[1] / m_k_strides[i]; - linidx[0] += idx0 * m_contract_strides[i]; - linidx[1] += idx1 * m_contract_strides[i]; - contract_val[0] -= idx0 * m_k_strides[i]; - contract_val[1] -= idx1 * m_k_strides[i]; - } - - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx[0] += contract_val[0]; - linidx[1] += contract_val[1]; - } else { - linidx[0] += contract_val[0] * m_contract_strides[0]; - linidx[1] += contract_val[1] * m_contract_strides[0]; - } - } - return IndexPair<Index>(linidx[0], linidx[1]); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { - // Only claim alignment when we can compute the actual stride (ie when we're - // dealing with the lhs with inner_dim_contiguous. 
This is because the - // matrix-vector product relies on the stride when dealing with aligned inputs. - return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { - return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1; - } - - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_tensor.bind(cgh); - } - #endif - - const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const { - return m_tensor; - } - - const nocontract_t& nocontract_strides() const { - return m_nocontract_strides; - } - const nocontract_t& ij_strides() const { return m_ij_strides; } - const contract_t& contract_strides() const { return m_contract_strides; } - const contract_t& k_strides() const { return m_k_strides; } - - protected: - CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor; - const nocontract_t m_nocontract_strides; - const nocontract_t m_ij_strides; - const contract_t m_contract_strides; - const contract_t m_k_strides; -}; - -template<typename Scalar, typename Index, int side, - typename Tensor, - typename nocontract_t, typename contract_t, - int packet_size, bool inner_dim_contiguous, - bool inner_dim_reordered, int Alignment, template <class> class MakePointer_> -class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> -{ - public: - typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, 
- const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - template <typename PacketT,int AlignmentType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type - load(Index i, Index j) const - { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - // current code assumes packet size must be a multiple of 2 - EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - - if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { - const Index index = this->computeIndex(i, j); - eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); - return this->m_tensor.template packet<AlignmentType>(index); - } - - const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1); - const Index first = indexPair.first; - const Index lastIdx = indexPair.second; - - // We can always do optimized packet reads from left hand side right now, because - // the vertical matrix dimension on the left hand side is never contracting. - // On the right hand side we need to check if the contracting dimensions may have - // been shuffled first. 
- if (Tensor::PacketAccess && - (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) && - (lastIdx - first) == (packet_size - 1)) { - - return this->m_tensor.template packet<AlignmentType>(first); - } - - EIGEN_ALIGN_MAX Scalar data[packet_size]; - - data[0] = this->m_tensor.coeff(first); - EIGEN_UNROLL_LOOP - for (Index k = 1; k < packet_size - 1; k += 2) { - const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1); - data[k] = this->m_tensor.coeff(internal_pair.first); - data[k + 1] = this->m_tensor.coeff(internal_pair.second); - } - data[packet_size - 1] = this->m_tensor.coeff(lastIdx); - - return pload<PacketT>(data); - } - - template <typename PacketT,int AlignmentType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type - load(Index i, Index j) const - { - const Index requested_packet_size = internal::unpacket_traits<PacketT>::size; - EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; - - const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); - const Index first = indexPair.first; - const Index lastIdx = indexPair.second; - - data[0] = this->m_tensor.coeff(first); - for (Index k = 1; k < requested_packet_size - 1; k += 2) { - const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1); - data[k] = this->m_tensor.coeff(internal_pair.first); - data[k + 1] = this->m_tensor.coeff(internal_pair.second); - } - data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx); - - return pload<PacketT>(data); - } - - template <typename PacketT,int AlignmentType> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { - return this->load<PacketT,AlignmentType>(i,j); - } -}; - - -template<typename Scalar, typename Index, int side, - typename Tensor, - typename nocontract_t, typename contract_t, - bool inner_dim_contiguous, - bool inner_dim_reordered, 
int Alignment, template <class> class MakePointer_> -class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> - : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> -{ - public: - typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - template <typename PacketT,int> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload<PacketT>(data); - } - template <typename PacketT,int> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload<PacketT>(data); - } -}; - - -template<typename Scalar, typename Index, int side, - typename Tensor, - typename nocontract_t, typename contract_t, - int packet_size, - bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer> -class TensorContractionSubMapper { - public: - - typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper; - typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, 
MakePointer_> Self; - typedef Self LinearMapper; - - enum { - // We can use direct offsets iff the parent mapper supports then and we can compute the strides. - // TODO: we should also enable direct offsets for the Rhs case. - UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0) - }; - - EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { - // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute - // this offset every time we attempt to access a coefficient. - if (UseDirectOffsets) { - Index stride = m_base_mapper.stride(); - m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); - } - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper(i, 0); - } - return m_base_mapper(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper(i, j); - } - return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); - } - - template <typename PacketT> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0); - } - return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset); - } - - template <typename PacketT> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j); - } - return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset); - } - - template <typename PacketT, int AlignmentType> - 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper.template load<PacketT,AlignmentType>(i, j); - } - return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset); - } - - template <typename PacketT> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { - if (UseDirectOffsets) { - m_base_mapper.storePacket(i, 0, p); - } - m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { - if (UseDirectOffsets) { - return LinearMapper(m_base_mapper, i, j); - } - return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); - } - - template <typename PacketT, int AlignmentType> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0); - } - return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset); - } - - template <typename PacketT> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { - return false; - } - - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_base_mapper.bind(cgh); - } - #endif - - const ParentMapper& base_mapper() const { return m_base_mapper; } - Index vert_offset() const { return m_vert_offset; } - Index horiz_offset() const { return m_horiz_offset; } - - private: - ParentMapper m_base_mapper; - const Index m_vert_offset; - const Index m_horiz_offset; -}; - - -template<typename Scalar_, typename Index, int side, - typename Tensor, - typename nocontract_t, typename contract_t, - int packet_size, - bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer> -class TensorContractionInputMapper - : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> { - - public: - typedef Scalar_ Scalar; - typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base; - typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper; - typedef SubMapper VectorMapper; - - EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& 
k_strides) - : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { - return SubMapper(*this, i, j); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { - return VectorMapper(*this, i, j); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const { - return Base::m_tensor; - } -}; - - -template <typename T> struct TensorContractionInputMapperTrait; - -template<typename Scalar_, typename Index_, int side_, - typename Tensor_, - typename nocontract_t_, typename contract_t_, - int packet_size_, - bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_, template <class> class MakePointer_> -struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_, - nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_, - inner_dim_reordered_, Alignment_, MakePointer_> > { - - typedef Tensor_ XprType; - static const bool inner_dim_contiguous = inner_dim_contiguous_; - static const bool inner_dim_reordered = inner_dim_reordered_; - }; - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h deleted file mode 100755 index 473c228..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h +++ /dev/null @@ -1,1650 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. 
If a copy of the MPL was not -// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorContractionSycl.h - * - * \brief: - * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend - * - *****************************************************************/ - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H - -namespace Eigen { - -namespace TensorSycl { -namespace internal { - -#ifndef EIGEN_SYCL_DISABLE_GEMV -/*! - * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector - * contraction kernel on various hardware devices. - * - * \tparam Scalar: determines the element type of the tensor/vector - * - * \tparam StorageIndex determines the Index type. - * - * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group - * - * \tparam CFactor: determines the number of contracting element to be process by each thread - * - * \tparam NCFactor: determines the number of non-contracting element to be process by each thread - */ -template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor> -struct TVPanelSize { - // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; - // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; - // TileSizeDimNC: determines the tile size for the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; - // TileSizeDimC: determines the tile size for the contracting dimension - static EIGEN_CONSTEXPR 
StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; - // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; - // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; - // BC : determines if supporting bank conflict is required - static EIGEN_CONSTEXPR bool BC = false; -}; -#endif - -/*! - * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor - contraction kernel on various hardware devices. - * - * \tparam Scalar: determines the element type of the tensor - * - * \tparam StorageIndex: determines the Index type. - * - * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the - available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro). - * - * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the - available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro). - * - * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered - */ - -template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK> -struct TTPanelSize { - // TileSizeDimK: determines Tile size for dimension K. 
The packet size is assumed to be considered - static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; - // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the - // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// -#ifndef EIGEN_SYCL_REG_M - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; -#else - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; -#endif -// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the -// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro -#ifndef EIGEN_SYCL_REG_N - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; -#else - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; -#endif - // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; - // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; - // TileSizeDimM: determines the tile size for the m dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; - // TileSizeDimN: determines the tile size for the n dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; - // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize - static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = - ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); - // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. 
This must be divisable by packetsize - static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = - ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); - // BC : determines if supporting bank conflict is required - static EIGEN_CONSTEXPR bool BC = true; - // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by - // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient local memory) - static EIGEN_CONSTEXPR bool DoubleBuffer = -#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER - false; -#else - true; -#endif -}; - -/* ! - * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to - * specialize the contraction algorithm based on device support for dedicated local memory. - */ -enum class contraction_type { local, no_local }; -/* ! - * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private). - */ -enum class data_source { global_mem, local_mem, private_mem }; - -/*! - * \brief read, a template function used for loading the data from global - memory. This function is used to guarantee coalesced and vectorized load whenever possible - * - * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode - * - * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and - vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the - contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case - when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. 
- * - * \tparam PacketType: determines the type of packet - * - * \tparam TensorMapper: determines the input tensor mapper type - * - * \tparam StorageIndex: determines the Index type - - * \param tensorMapper: is the input tensor - * - * \param NCIndex: is the non-contracting dim index - * - * \param CIndex is the contracting dim index - * - * \param ld: is the leading dimension of the flattened tensor - */ -template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper, - typename StorageIndex> -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read( - const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { - const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; - const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; - return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld)); -} - -/*! - * \brief read, special overload of read function, when the read access is not vectorized - * - * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode - * - * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and - vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the - contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case - when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. 
- * - * \tparam PacketType: determines the type of packet - * - * \tparam TensorMapper: determines the input tensor mapper type - * - * \tparam StorageIndex: determines the Index type - - * \param tensorMapper: is the input tensor - * - * \param NCIndex: is the non-contracting dim index - * - * \param CIndex: is the contracting dim index - */ -template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex> -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read( - const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { - const StorageIndex row = (IsRhs) ? CIndex : NCIndex; - const StorageIndex col = (IsRhs) ? NCIndex : CIndex; - return tensorMapper(row, col); -} - -/*! - * \brief write, a template function used for storing the data to local memory. This function is used to guarantee - * coalesced and vectorized store whenever possible. - * - * \tparam StorageIndex: determines the Index type - * - * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory - * - * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. 
- * - * \tparam PacketType: determines the type of packet - * - * \tparam DataScalar: determines the output data type - * - * \param packet_data: the data to be written in the local memory - * - * \param ptr: a pointer to the local memory - * - * \param CIndex is the contracting dim index - */ - -template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar> -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type - write(PacketType &packet_data, DataScalar ptr) { - EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; i++) { - *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data); - ptr += ld; - } -} - -/*! - * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function - * is used to guarantee coalesced and vectorized store whenever possible. - * - * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. - * - * \tparam PacketType: determines the type of packet - * - * \tparam DataScalar: determines the output data type - * - * \param packet_data: the data to be written in the local memory - * - * \param ptr: a pointer to the local memory - */ - -template <data_source dt, typename PacketType, typename DataScalar> -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< - Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type -write(PacketType &packet_data, DataScalar *ptr) { - ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data); -} - -/*! - * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. - * - * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. 
- * - * \tparam PacketType: determines the type of packet - * - * \tparam DataScalar: determines the output data type - * - * \param packet_data: the data to be written in the local memory - * - * \param ptr: a pointer to the local memory - */ -template <data_source dt, typename PacketType, typename DataScalar> -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< - Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type -write(PacketType &packet_data, DataScalar *ptr) { - *ptr = packet_data; -} - -/*! - * \brief check_boundary: is used to check the edge condition for non-internal blocks. - * - * \tparam is_internal: determines if the block is internal - */ -template <bool is_internal> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { - return true; -} - -/*! - * \brief check_boundary: specialization of the check_boundary for non-internal blocks. - * - * \param cond: true when the data is in range. Otherwise false - */ -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) { - return cond; -} - -/*! - * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed - * by each workgroup. - * - * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed - * - * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode - * - * \tparam PacketType: determines the type of packet - * - * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be - * packetType; Otherwise it will be scalar Type - * - * \param elements_per_access determines the size of each element based on OutType - * - * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and - * vectorized when possible. 
Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the - * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case - * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. - * - * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the - * Tensor Block for each workgroup - * - * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the - * Tensor Block for each workgroup - */ -template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType> -struct BlockProperties { - static EIGEN_CONSTEXPR bool packet_load = packet_load_; - typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar; - static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; - typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType; - static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size; - static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); - static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); - static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); -}; - -/*! - * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see - * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup, - * work-items - * - * \tparam StorageIndex: determines the StorageIndex Type - * - * \param linearLocalThreadId: determines the linearized location of a thread within a work-group - * - * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. 
It will be > 1 when - * tall/skinny algorithm is used - * - * \param mGroupOffset: determines the logical start position of all threads within a workgroup for the m dimension of - * the flattened tensor. - * - * \param nGroupOffset: determines the logical start position of all threads within a workgroup for the n dimension of - * the flattened tensor. - * - * \param kGroupOffset determines the logical start position of all threads within a workgroup for the k dimension of the - * flattened tensor. It will be > 1 when tall/skinny algorithm is used. - * - * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a - * flattened tensor. The position determines the distance of each thread within the workgroup from each other - * independent from their global position. - * - * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a - * flattened tensor. The position determines the distance of each thread within the workgroup from each other - * independent from their global position. - * - * \param mGlobalOffset: determines the logical start position of each thread for the m dimension on a - * flattened tensor - * - * \param nGlobalOffset: determines the logical start position of each thread for the n dimension on a - * flattened tensor - * - * \param kSize : determines the number of the k elements of the flattened Tensor to be processed by each thread for the - * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used. - * - * \param is_internal : determines if the thread within the work-group computes an internal block of tensor or - * the edge blocks. When it is internal, there is no need to check the boundaries and all the if statements can be - * resolved by the compiler.
- */ -template <typename StorageIndex> -struct ThreadProperties { - const StorageIndex linearLocalThreadId; - const StorageIndex kGroupId; - const StorageIndex mGroupOffset; - const StorageIndex nGroupOffset; - const StorageIndex kGroupOffset; - const StorageIndex mLocalOffset; - const StorageIndex nLocalOffset; - const StorageIndex mGlobalOffset; - const StorageIndex nGlobalOffset; - StorageIndex kSize; - const bool is_internal; - // this is used to adjust the last block - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( - const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, - const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, - const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, - StorageIndex kSize_, const bool is_internal_) - : linearLocalThreadId(linearLocalThreadId_), - kGroupId(kGroupId_), - mGroupOffset(mGroupOffset_), - nGroupOffset(nGroupOffset_), - kGroupOffset(kGroupOffset_), - mLocalOffset(mLocalOffset_), - nLocalOffset(nLocalOffset_), - mGlobalOffset(mGlobalOffset_), - nGlobalOffset(nGlobalOffset_), - kSize(kSize_), - is_internal(is_internal_) {} -}; - -/*! - * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation. 
- * - * \tparam OutScalar: determines the output scalar type - * - * \tparam LhsScalar: determines the left-hand-side scalar type - * - * \tparam RhsScalar: determines the right-hand-side scalar type - * - * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification - (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) - * - * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix - * - * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix - * - * \tparam StorageIndex: determines the StorageIndex Type - * - * \tparam Properties: determines the Contraction Panel properties - * - * \tparam TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as a matrix - * - * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. - * - * \tparam input_mapper_properties : determines if the input tensors are matrices. If they are matrices, special memory - access is used to guarantee that the memory accesses are always coalesced. - * - * \tparam IsFinal : determines if this is the final kernel. If so, the result will be written in a final output. - Otherwise, the result of contraction will be written in a temporary buffer. This is the case when Tall/Skinny - contraction is used. So in this case, a final reduction step is required to compute final output.
- - * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of - the algorithm to be used - * - * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group - * - * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) - * - * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) - * - * \param out_res: determines the output tensor containing the contraction result - * - * \param groupSizeM: a logical number determining the number of work-group for m dimension - * - * \param groupSizeN: a logical number determining the number of work-group for n dimension - * - * \param numTiles: determines total number of tiles on the k dimension - * - * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix - */ -template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper, - typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable, - typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp> -class TensorContractionKernel { - public: - typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType - PacketReturnType; - static EIGEN_CONSTEXPR int PacketSize = - Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize; - static EIGEN_CONSTEXPR bool is_lhs_transposed = - !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous; - static EIGEN_CONSTEXPR bool is_rhs_transposed = - !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous; - - typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable, - PacketReturnType> - LHSBlockProperties; - - typedef BlockProperties<is_rhs_transposed, 
true, input_mapper_properties::is_rhs_matrix && Vectorizable, - PacketReturnType> - RHSBlockProperties; - - static EIGEN_CONSTEXPR StorageIndex NStride = - contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; - - typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch; - typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr; - typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr; - typedef - typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type - tile_ptr; - static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local - ? Properties::TileSizeDimM + Properties::BC - : Properties::WorkLoadPerThreadM; - static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local - ? Properties::TileSizeDimN + Properties::BC - : Properties::WorkLoadPerThreadN; - static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; - - /** - * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not - * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to - * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting - * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out - * different type of memory needed when local/no_local memory computation is called. 
- * - * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation - of the algorithm to be used - * \tparam the private memory size - * \param ptr the tile memory pointer type - */ - template <contraction_type, StorageIndex> - struct MemHolder { - tile_ptr ptr; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} - }; - /** - * \brief specialization of memHolder class when no local memory kernel is used. - */ - template <StorageIndex MemSize> - struct MemHolder<contraction_type::no_local, MemSize> { - OutScalar ptr[MemSize] = {OutScalar{0}}; - }; - /** - * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from - * global memory to local/private memory when local/no_local algorithm used. - * - * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the - * selected contraction_type. - * - * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the - * selected contraction_type. - * - * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private - * memory is used this is set to zero as this is not applicable in case of private memory. - * - * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private - * memory is used this is set to zero as this is not applicable in case of private memory. - * - * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the - * same as lhs_scratch_extract for private memory. - * - * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the - * same as rhs_scratch_extract for private memory. 
- */ - struct TiledMemory { - MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract; - MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract; - tile_ptr lhs_scratch_ptr_compute; - tile_ptr rhs_scratch_ptr_compute; - const std::pair<StorageIndex, StorageIndex> lhs_extract_index; - const std::pair<StorageIndex, StorageIndex> rhs_extract_index; - template <contraction_type tp = contraction_tp> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr, - typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0) - : lhs_scratch_extract{}, - rhs_scratch_extract{}, - lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), - rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), - lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})), - rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {} - - template <contraction_type tp = contraction_tp> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr, - typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0) - : lhs_scratch_extract{block_start_ptr}, - rhs_scratch_extract{lhs_scratch_extract.ptr + - ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, - lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), - rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), - lhs_extract_index( - local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)), - rhs_extract_index( - local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {} - }; - - Scratch scratch; - const LhsMapper lhs; - const RhsMapper rhs; - OutAccessor out_res; - const 
StorageIndex groupSizeM; - const StorageIndex groupSizeN; - const StorageIndex numTiles; - const TripleDim triple_dim; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, - const RhsMapper rhs_, OutAccessor out_res_, - const StorageIndex groupSizeM_, - const StorageIndex groupSizeN_, - const StorageIndex numTiles_, - const TripleDim triple_dim_) - : scratch(scratch_), - lhs(lhs_), - rhs(rhs_), - out_res(out_res_), - groupSizeM(groupSizeM_), - groupSizeN(groupSizeN_), - numTiles(numTiles_), - triple_dim(triple_dim_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, - const RhsMapper rhs_, OutAccessor out_res_, - const StorageIndex groupSizeM_, - const StorageIndex numTiles_, - const TripleDim triple_dim_) - : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { - const StorageIndex linearLocalThreadId = itemID.get_local_id(0); - const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; - const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; - const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; - const StorageIndex tmp = itemID.get_group(0) / groupSizeM; - const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; - const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN; - const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; - const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; - const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; - const StorageIndex nLocalOffset = NStride * nLocalThreadId; - const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; - const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; - - const StorageIndex kSizePerWG = IsFinal ? 
triple_dim.K : numTiles * Properties::TileSizeDimK; - StorageIndex kGroupOffset = kGroupId * kSizePerWG; - const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && - triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && - triple_dim.K - kGroupOffset >= kSizePerWG; - // this is used to adjust the last block - StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); - // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to - // tile - kGroupOffset += kSize; - - auto thread_properties = - ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, - mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); - - auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); - - (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr) - : compute_panel<false>(itemID, thread_properties, out_ptr); - } - // The compute block computes the contraction operation private block for each thread and store the resutl in the - // privateRes memory of Each computation the compute block function is independent of local and no local concepts as - // it only compute the block on each thread's private memory space - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, - PacketReturnType *privateRes) { - StorageIndex idx = 0; - EIGEN_CONSTEXPR StorageIndex lhs_stride = - contraction_tp == contraction_type::local ? 
(PacketSize * Properties::LocalThreadSizeM) : 1; - EIGEN_UNROLL_LOOP - for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { - auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; - StorageIndex lhs_index = 0; - EIGEN_UNROLL_LOOP - for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { - PacketReturnType lhsPack{}; - Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack, - lhs_block_ptr + lhs_index); - privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); - - lhs_index += lhs_stride; - idx++; - } - } - } - // The store function write the computed contraction operation in the private memory of each thread to the global - // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base - // class. - template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, - StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { - auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { - return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); - }; - // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is - // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId* - // WorkLoadPerThreadN slice of N - EIGEN_CONSTEXPR StorageIndex GlobalNStride = - contraction_tp == contraction_type::local ? 
1 : Properties::LocalThreadSizeN; - EIGEN_UNROLL_LOOP - for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { - // output leading dimension - StorageIndex outputLD = 0; - // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local - // memory and extracting from local to global is the same as no transposed version. However, when local memory is - // not used and RHS is transposed we packetize the load for RHS. - EIGEN_UNROLL_LOOP - for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { - StorageIndex globalRow = mGlobalOffset; - EIGEN_UNROLL_LOOP - for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { - PacketReturnType privetOut = privateRes[wLPTM]; - if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) { - // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second - // StorageIndex Therefore it is always coalesced layout - write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow); - } else { - EIGEN_UNROLL_LOOP - for (StorageIndex mId = 0; mId < PacketSize; mId++) { - StorageIndex mOffset = globalRow + mId; - if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { - out_ptr[mOffset + outputLD] = - Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut); - } - } - } - globalRow += (PacketSize * Properties::LocalThreadSizeM); - } - outputLD += triple_dim.M; - privateRes += Properties::WorkLoadPerThreadM / PacketSize; - } - out_ptr += (GlobalNStride * outputLD); - - nGlobalOffset += (PrivateNStride * GlobalNStride); - } - } - // when no local memory is used the following extract_block will be enabled - template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg, - contraction_type contract_tp = contraction_tp> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename 
::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type - extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &, - const StorageIndex &ncOffset, const StorageIndex cOffset) { - EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = - InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; - EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = - InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; - const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; - - auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { - return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && - (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); - }; - const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; - StorageIndex cIndex = cOffset; - - EIGEN_UNROLL_LOOP - for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { - StorageIndex ncIndex = ncOffset; - EIGEN_UNROLL_LOOP - for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { - if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) { - auto val = - read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout, - InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld); - - write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC), - data_source::private_mem>(val, private_ptr); - } else { - EIGEN_UNROLL_LOOP - for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { - const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); - const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 
0 : i); - OutScalar val = - (ncInd < NC && cInd < triple_dim.K) - ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>( - inpt, ncInd, cInd, ld) - : OutScalar(0); - write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC), - data_source::private_mem>( - val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + - ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); - } - } - - // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So - // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. - ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) - ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC - : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); - private_ptr += InputBlockProperties::nc_stride; - } - // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC - private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; - cIndex += InputBlockProperties::c_stride; - } - } - template <typename InputBlockProperties, StorageIndex TileSizeDimNC> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract( - const StorageIndex &linearLocalThreadId) { - const StorageIndex localThreadNC = - (InputBlockProperties::is_coalesced_layout) - ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) - : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); - const StorageIndex localThreadC = - (InputBlockProperties::is_coalesced_layout) - ? 
linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) - : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); - return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC); - } - - template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type - sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { - db_offset = !db_offset; - } - - template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type - sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept { - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - - template <contraction_type ctp = contraction_tp> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type - sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept { - return; - } - - template <bool need_sync, contraction_type ctp = contraction_tp> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type - sync_thread(const cl::sycl::nd_item<1> & -#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION - itemID -#endif - ) noexcept { -#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION - itemID.barrier(cl::sycl::access::fence_spacce::local_space); -#else - return; -#endif - } - template <bool need_sync, contraction_type ctp = contraction_tp> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type - sync_thread(const cl::sycl::nd_item<1> &itemID) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - template <bool need_sync> - static 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread( - const cl::sycl::nd_item<1> &) { - return; - } - - template <bool is_internal_block> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, - ThreadProperties<StorageIndex> &thread_properties, - TiledMemory &tiled_input_block, - PacketReturnType *privateRes, bool &db_offset) { - // Tiling the Rhs block from global to local memory - extract_block<RHSBlockProperties, is_internal_block>( - rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), - tiled_input_block.rhs_extract_index, - contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, - thread_properties.kGroupOffset - thread_properties.kSize); - - sync_thread<contraction_tp == contraction_type::no_local>(itemID); - - // Tiling the Lhs block from global to local memory - extract_block<LHSBlockProperties, is_internal_block>( - lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), - tiled_input_block.lhs_extract_index, - contraction_tp == contraction_type::local ? 
thread_properties.mGroupOffset : thread_properties.mGlobalOffset, - thread_properties.kGroupOffset - thread_properties.kSize); - - // itemID.barrier(cl::sycl::access::fence_space::local_space); - sync_thread<contraction_tp == contraction_type::local>(itemID); - // switch to compute mede - StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); - StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); - // Loop over the values of a single tile - for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { - compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, - tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); - lhs_offset += LSDL; - rhs_offset += LSDR; - } - // computing the K index for the next tile - thread_properties.kSize -= Properties::TileSizeDimK; - sync_mem(itemID, db_offset); - } - - // when local memory is available the following compute_panel will be enabled - template <bool is_internal_block, typename OutPtr> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, - ThreadProperties<StorageIndex> &thread_properties, - OutPtr out_ptr) { - auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; - // Allocate register space - PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { - PacketReturnType{0}}; - bool db_offset = 0; - - while (thread_properties.kSize >= Properties::TileSizeDimK) { - compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset); - } - if (thread_properties.kSize > 0) { - compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset); - } - - // Storing the final results in the output - store<is_internal_block, - contraction_tp == contraction_type::local ? 
static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>( - out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, - thread_properties.nGlobalOffset); - } - // When local memory is available the following extract_block will be enabled - template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local, - contraction_type contract_tp = contraction_tp> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type - extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index, - const StorageIndex &ncOffset, const StorageIndex cOffset) { - EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = - InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; - EIGEN_CONSTEXPR StorageIndex LoadPerThread = - InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; - EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; - static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && - (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), - " LocalOffset must be divisable by stride"); - const StorageIndex &NC = InputBlockProperties::is_rhs ? 
triple_dim.N : triple_dim.M; - StorageIndex localThreadNC = local_index.first; - StorageIndex localThreadC = local_index.second; - auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { - return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && - (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); - }; - EIGEN_UNROLL_LOOP - for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { - const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); - const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); - const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; - if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) { - auto val = - read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout, - InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld); - write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>( - val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + - (InputBlockProperties::c_stride * localThreadC * LSD)); - } else { - EIGEN_UNROLL_LOOP - for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { - const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); - const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); - OutScalar val = - (nCInd < NC && cInd < triple_dim.K) - ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>( - inpt, nCInd, cInd, ld) - : OutScalar(0); - - write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>( - val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + - (InputBlockProperties::is_coalesced_layout ? 
i : 0) + - ((InputBlockProperties::c_stride * localThreadC + - (InputBlockProperties::is_coalesced_layout ? 0 : i)) * - LSD)); - } - } - localThreadNC += (InputBlockProperties::is_coalesced_layout) - ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) - : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); - localThreadC += (InputBlockProperties::is_coalesced_layout) - ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) - : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); - } - } -}; - -#ifndef EIGEN_SYCL_DISABLE_GEMV - -/*! - * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special - * case of Tensor Tensor contraction. - * - * \tparam OutScalar: determines the output scalar type - * - * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification - * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) - * - * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) - * - * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) - * - * \tparam StorageIndex: determines the StorageIndex Type - * - * \tparam Properties: determines the Contraction Panel properties - * - * \tparam KFactor: determines the number of elements in K dimension in a Tile - * - * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. - * - * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector - * - * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output. - * Otherwise, the result of contraction will be written iin a temporary buffer. 
- * - * \param scratch: determines the local memory containing the vector block for each work-group - * - * \param vec: determines the vector input (tensor mapper) - * - * \param mat: determines the tensor input (tensor mapper) - * - * \param out_res: determines the output vector containing the contraction result - * - * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension - * - * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor - * - * \param contractDim: determines the size of non contracting dimension for the flattened tensor - * - */ -template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex, - typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal> -struct GeneralVectorTensor { - typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType - PacketReturnType; - static EIGEN_CONSTEXPR int PacketSize = - Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize; - typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch; - - static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = - KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; - - // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make - // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true. - typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? 
false : true, Vectorizable, PacketReturnType> - VecBlockProperties; - - Scratch scratch; - const VectorMapper vec; - const TensorMapper mat; - OutAccessor out_res; - const StorageIndex nonContractGroupSize; - const StorageIndex nonContractDim; - const StorageIndex contractDim; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, - const TensorMapper mat_, OutAccessor out_res_, - const StorageIndex nonContractGroupSize_, - const StorageIndex nonContractDim_, - const StorageIndex contractDim_) - : scratch(scratch_), - vec(vec_), - mat(mat_), - out_res(out_res_), - nonContractGroupSize(nonContractGroupSize_), - nonContractDim(nonContractDim_), - contractDim(contractDim_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { - auto scratch_ptr = scratch.get_pointer(); - const StorageIndex linearLocalThreadId = itemID.get_local_id(0); - StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC - : linearLocalThreadId % Properties::LocalThreadSizeNC; - StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC - : linearLocalThreadId / Properties::LocalThreadSizeNC; - const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; - const StorageIndex nonContractGroupId = - is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; - const StorageIndex contractGroupId = - is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; - auto out_ptr = out_res.get_pointer() + (IsFinal ? 
0 : contractGroupId * nonContractDim); - - const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; - const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; - auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; - const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; - const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; - auto local_output = scratch_ptr + OutScratchOffset; - const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && - contractDim - contractGroupOffset >= Properties::TileSizeDimC; - is_internal - ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr, -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - scratch_ptr, contractGroupOffset, -#endif - nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, - nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) - : compute_panel<false>(itemID, vec, mat, local_output, out_ptr, -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - scratch_ptr, contractGroupOffset, -#endif - nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, - nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); - } - template <bool is_internal_block, typename OutPtr> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( - const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, - OutPtr out_ptr, -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, -#endif - const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, - StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, - StorageIndex globalContractDimOffset, StorageIndex 
globalNonContractDimOffset, StorageIndex outScratchIndex) { - OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; - // Reading the vector -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; - extract_block<VecBlockProperties, is_internal_block, KFactor, - Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId, - vectorOffset, contractDim); - - itemID.barrier(cl::sycl::access::fence_space::local_space); - auto in_scratch_ptr = scratch_ptr + contractId; -#endif - - StorageIndex privateOffsetC = 0; - EIGEN_UNROLL_LOOP - for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { - StorageIndex privateOffsetNC = 0; - bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - auto vecScalar = *in_scratch_ptr; -#else - auto vecScalar = (check_boundary<is_internal_block>(contract_conds)) - ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, - is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) - : OutScalar(0); -#endif - EIGEN_UNROLL_LOOP - for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { - auto matScalar = (check_boundary<is_internal_block>( - contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) - ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC - : globalNonContractDimOffset + privateOffsetNC, - is_lhs_vec ? 
globalNonContractDimOffset + privateOffsetNC - : globalContractDimOffset + privateOffsetC) - : OutScalar(0); - - outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]); - privateOffsetNC += Properties::LocalThreadSizeNC; - } - privateOffsetC += Properties::LocalThreadSizeC; -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - in_scratch_ptr += Properties::LocalThreadSizeC; -#endif - } - - auto out_scratch_ptr = local_output + outScratchIndex; - // Each block of 16*16 element in shared memory should reduce to 16*1 - EIGEN_UNROLL_LOOP - for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { - *out_scratch_ptr = outScalar[j]; - - out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); - } - if (is_lhs_vec) { - nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; - contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; - outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; - } - - out_scratch_ptr = local_output + outScratchIndex; - EIGEN_UNROLL_LOOP - for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { - EIGEN_UNROLL_LOOP - for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (contractId < offset) { - StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); - *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; - } - } - // moving to next 16 by 16 block - out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); - } - - if (contractId == 0) { - out_scratch_ptr = local_output + nonContractId; - StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; - out_ptr += global_final_offset; - EIGEN_UNROLL_LOOP - for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { - if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) { - auto res = *out_scratch_ptr; - - *out_ptr = res; - 
out_ptr += Properties::LocalThreadSizeNC; - } - // moving to next 16 by 16 block to ge the next 16 reduced elements - out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); - if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; - } - } - } - - template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input, - typename Local> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, - const StorageIndex &linearLocalThreadId, - const StorageIndex &cOffset, const StorageIndex &C) { - local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; - StorageIndex cIndex = cOffset; - for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { - if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) { - auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout, - InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0), - cIndex, StorageIndex(1)); - write<StorageIndex, 1, data_source::local_mem>(val, local_ptr); - } else { - EIGEN_UNROLL_LOOP - for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { - OutScalar val = - (cIndex + i < C) - ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>( - inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) - : OutScalar(0); - write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i); - } - } - local_ptr += InputBlockProperties::c_stride * GroupSize; - cIndex += InputBlockProperties::c_stride * GroupSize; - } - } -}; -#endif - -#ifndef EIGEN_SYCL_DISABLE_SCALAR - -/*! - * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction - * operation, when all the dimensions are contracting dimensions. 
This Kernel reduces two tensors to an scalar - * - * \tparam OutScalar: determines the output scalar type - * - * \tparam LhsScalar: determines the left-hand-side scalar type - * - * \tparam RhsScalar: determines the right-hand-side scalar type - * - * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification - * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) - * - * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix - * - * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix - * - * \tparam StorageIndex: determines the StorageIndex Type - * - * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. - * - * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group - * - * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) - * - * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) - * - * \param out_res: determines the output tensor containing the contraction result - * - * \param rng: determins the total input data size - */ -template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper, - typename RhsMapper, typename StorageIndex, bool Vectorizable> -struct GeneralScalarContraction { - typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch; - Scratch scratch; - const LhsMapper lhs; - const RhsMapper rhs; - OutAccessor out_res; - const StorageIndex rng; - - EIGEN_DEVICE_FUNC - GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, - const StorageIndex rng_) - : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} - - EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { - auto 
out_ptr = out_res.get_pointer(); - auto scratch_ptr = scratch.get_pointer().get(); - - StorageIndex globalid = itemID.get_global_id(0); - StorageIndex localid = itemID.get_local_id(0); - OutScalar accumulator = OutScalar(0); - for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { - accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator); - } - auto out_scratch_ptr = scratch_ptr + localid; - *out_scratch_ptr = accumulator; - for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (localid < offset) { - *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); - } - } - if (localid == 0) { - out_ptr[itemID.get_group(0)] = accumulator; - } - } -}; -#endif - -} // namespace internal -} // namespace TensorSycl - -template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType> -struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, - Eigen::SyclDevice> - : public TensorContractionEvaluatorBase<TensorEvaluator< - const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> { - static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value, - "SYCL tensor contraction does not support output kernels."); - - typedef Eigen::SyclDevice Device; - - typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self; - typedef TensorContractionEvaluatorBase<Self> Base; - typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index StorageIndex; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename Base::Storage Storage; 
- typedef typename Base::EvaluatorPointerType EvaluatorPointerType; - struct TripleDim { - const StorageIndex M; - const StorageIndex N; - const StorageIndex K; - TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} - }; - enum { - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, - }; - - static EIGEN_CONSTEXPR int LDims = Base::LDims; - static EIGEN_CONSTEXPR int RDims = Base::RDims; - static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims; - - typedef array<StorageIndex, LDims> left_dim_mapper_t; - typedef array<StorageIndex, RDims> right_dim_mapper_t; - - typedef array<StorageIndex, ContractDims> contract_t; - typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t; - typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes<StorageIndex, NumDims> Dimensions; - - typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator; - typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar; - typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> - struct input_mapper_propertis { - static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; - static EIGEN_CONSTEXPR bool is_rhs_matrix = - (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); - }; - - TensorEvaluator(const XprType &op, const Device &device) : 
Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (!data) { - this->m_result = this->m_device.get( - static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); - data = this->m_result; - } - evalToSycl(data); - return (this->m_result != NULL); - } - const Eigen::SyclDevice &device() const { return this->m_device; } - void evalToSycl(typename Base::EvaluatorPointerType buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<true, true, true, Unaligned>(buffer); - } else { - evalTyped<true, true, false, Unaligned>(buffer); - } - } else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<true, false, true, Unaligned>(buffer); - } else { - evalTyped<true, false, false, Unaligned>(buffer); - } - } - } else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<false, true, true, Unaligned>(buffer); - } else { - evalTyped<false, true, false, Unaligned>(buffer); - } - } else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<false, false, true, Unaligned>(buffer); - } else { - evalTyped<false, false, false, Unaligned>(buffer); - } - } - } - } - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalTyped(typename Base::EvaluatorPointerType buffer) const { - const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; - typedef internal::TensorContractionInputMapper< - LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, - PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> - LhsMapper; - - 
typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator, - right_nocontract_t, contract_t, - PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> - RhsMapper; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - -#ifndef EIGEN_SYCL_DISABLE_SCALAR - if (triple_dim.M == 1 && triple_dim.N == 1) { - launchSC(buffer, lhs, rhs, triple_dim.K); - } else -#endif -#ifndef EIGEN_SYCL_DISABLE_GEMV - if (triple_dim.M != 1 && triple_dim.N == 1) { - LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K); - } else if (triple_dim.M == 1 && triple_dim.N != 1) { - LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K); - } else // This is equivalent of if (m!=1 && n!=1) -#endif - { - typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> - inpt_mapper_properties; -#ifndef EIGEN_SYCL_DISABLE_SKINNY - bool skinny = false; - auto platform_name = this->device().getPlatformName(); - // This is based on empirical calculation for AMD r9-nano and Fiji - if (platform_name.find("AMD") == 0) { - skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && - ((triple_dim.M < 1024 && triple_dim.N < 1024) || - (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); - } else { - skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || - ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || - ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); - } - if (skinny) - adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, 
triple_dim); - else -#endif // EIGEN_SYCL_DISABLE_SKINNY - adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim); - } - } - - template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper> - void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, - const TripleDim &triple_dim) const { -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - if (device().has_local_memory()) { - typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters; - launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>( - buffer, lhs, rhs, triple_dim); - } -#endif -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF - if (!(device().has_local_memory())) { - typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters; - launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>( - buffer, lhs, rhs, triple_dim); - } -#endif - } - - template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties, - typename Properties, typename LhsMapper, typename RhsMapper> - void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, - const TripleDim &triple_dim) const { - const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); - const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); - const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; - const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; - - const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); - StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; - StorageIndex groupSizeK = - skinny - ? 
std::max(std::min(totalTilesK, - (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / - (groupSizeM * groupSizeN)), - StorageIndex(1)) - : StorageIndex(1); - - const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; - - const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; - - const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; - const StorageIndex globalRange = totalGroupSize * localRange; - - const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) - ? ((Properties::DoubleBuffer + 1) * - (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + - ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * - (Properties::TileSizeDimN + Properties::BC)) - : StorageIndex(1); - - auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); - if (groupSizeK == 1) { - typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, - LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim, - PacketAccess, input_mapper_properties, true, ct> - ContractKernelName; - device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>( - lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); - } else { - typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, - LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim, - PacketAccess, input_mapper_properties, false, ct> - ContractKernelName; - CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>( - device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); - EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - - device().template 
binary_kernel_launcher<CoeffReturnType, ContractKernelName>( - lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, - triple_dim); - - typedef Eigen::internal::SumReducer<CoeffReturnType> Op; - auto op = Op(); - typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType, - EvaluatorPointerType, Op> - ReductionKernel; - - device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>( - tmp_global_accessor, buffer, - cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( - Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), - cl::sycl::range<1>(localRange)), - StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); - - device().deallocate_temp(temp_pointer); - } - } - -#ifndef EIGEN_SYCL_DISABLE_GEMV - template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex> - void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, - StorageIndex NC, StorageIndex C) const { - const StorageIndex nonContractDim = NC; - EIGEN_CONSTEXPR StorageIndex NCFactor = 1; - EIGEN_CONSTEXPR StorageIndex CFactor = 1; - EIGEN_CONSTEXPR StorageIndex NCWindow = 16; - typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor> - Properties; - const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); - const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); - const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); - const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); - const StorageIndex globalRange = - (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); - 
const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; - const StorageIndex scratchSize = - (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; - auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); - if (cNumGroups > 1) { - typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper, - TensorMapper, StorageIndex, Properties, CFactor, false, - is_lhs_vec, false> - ContractKernelName; - CoeffReturnType *temp_pointer = - static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); - EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - - device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>( - vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C); - - typedef Eigen::internal::SumReducer<CoeffReturnType> Op; - typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType, - EvaluatorPointerType, Op> - ReductionKernel; - - device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>( - tmp_global_accessor, buffer, - cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), - cl::sycl::range<1>(localRange)), - StorageIndex(1), Op(), nonContractDim, cNumGroups); - - device().deallocate_temp(temp_pointer); - } else { - typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper, - TensorMapper, StorageIndex, Properties, CFactor, false, - is_lhs_vec, true> - ContractKernelName; - device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>( - vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); - } - } -#endif - -#ifndef EIGEN_SYCL_DISABLE_SCALAR - 
template <typename LhsMapper, typename RhsMapper> - EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, - StorageIndex K) const { - EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & - (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), - "The Local thread size must be a power of 2 for the reduction " - "operation"); - EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; - - // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread - // reduces at least 512 elementss individually, we get better performance. - const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1); - const StorageIndex global_range = num_work_group * local_range; - - typedef Eigen::TensorSycl::internal::GeneralScalarContraction< - CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> - ContractKernelName; - auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); - if (num_work_group > 1) { - CoeffReturnType *temp_pointer = - static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); - EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor, - thread_range, local_range, K); - typedef Eigen::internal::SumReducer<CoeffReturnType> Op; - typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType, - EvaluatorPointerType, StorageIndex, local_range> - GenericRKernel; - device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>( - tmp_global_accessor, buffer, - cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), 
cl::sycl::range<1>(local_range)), local_range, Op()); - - device().deallocate_temp(temp_pointer); - } else { - device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range, - local_range, K); - } - } -#endif - - EIGEN_STRONG_INLINE void cleanup() { - this->m_leftImpl.cleanup(); - this->m_rightImpl.cleanup(); - - if (this->m_result) { - this->m_device.deallocate_temp(this->m_result); - this->m_result = NULL; - } - } - // The placeholder accessors must bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - this->m_leftImpl.bind(cgh); - this->m_rightImpl.bind(cgh); - this->m_result.bind(cgh); - } -}; -} // namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h deleted file mode 100644 index 21be6ea..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h +++ /dev/null @@ -1,1679 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H - -// evaluator for thread pool device -#ifdef EIGEN_USE_THREADS - -namespace Eigen { - -template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType> -struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> : - public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > { - - typedef ThreadPoolDevice Device; - - typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self; - typedef TensorContractionEvaluatorBase<Self> Base; - - typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - - enum { - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
- typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; - static const int RDims = - internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; - static const int ContractDims = internal::array_size<Indices>::value; - - typedef array<Index, LDims> left_dim_mapper_t; - typedef array<Index, RDims> right_dim_mapper_t; - - typedef array<Index, ContractDims> contract_t; - typedef array<Index, LDims - ContractDims> left_nocontract_t; - typedef array<Index, RDims - ContractDims> right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes<Index, NumDims> Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; - - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - template <int Alignment> - void evalProduct(Scalar* buffer) const { - evalProductImpl<NoCallback, Alignment>(buffer, NoCallback()); - } - - template <typename EvalToCallback, int Alignment> - void evalProductAsync(Scalar* buffer, EvalToCallback done) const { - evalProductImpl<EvalToCallback, Alignment>(buffer, std::move(done)); - } - - template <typename DoneCallback, int Alignment> - void evalProductImpl(Scalar* buffer, DoneCallback done) const { - // This function 
computes a lot of heuristics in multiple steps, and it - // also has multiple exit points. To keep it sane, readable and all in one - // place, sync/async execution decision is made at runtime at the very end. - // - // (1) In sync mode we allocate Context on the stack, submit computations - // to the device thread pool, and block on a barrier until it is - // completed. - // - // (2) In async mode we allocate Context on the heap, and after all tasks - // are finished, we call provided the done callback, and delete a - // context from the heap. - // - // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state - // and temporary buffers, requried for executing the tensor contraction. - // They are responsible for cleaning it up after contraction is done. - static const bool IsEvalInSyncMode = - std::is_same<DoneCallback, NoCallback>::value; - - const Index m = this->m_i_size; - const Index n = this->m_j_size; - const Index k = this->m_k_size; - if (m == 0 || n == 0 || k == 0) return; - - // Compute a set of algorithm parameters: - // - kernel block sizes (bm, bn, bk) - // - task grain sizes (number of kernels executed per task: gm, gn) - // - number of threads - // - sharding by row/column - // - parallel packing or first lhs then rhs - // and some derived parameters: - // - number of tasks (nm, nn, nk) - // - number of kernels (nm0, nn0) - // Unfortunately, all these parameters are tightly interdependent. - // So in some cases we first compute approximate values, then compute other - // values based on these approximations and then refine the approximations. - - // There are lots of heuristics here. There is some reasoning behind them, - // but ultimately they are just tuned on contraction benchmarks for - // different input configurations, thread counts and instruction sets. - // So feel free to question any of them. - - // Compute whether we want to shard by row or by column. - // This is a first approximation, it will be refined later. 
Since we don't - // know number of threads yet we use 2, because what's we are most - // interested in at this point is whether it makes sense to use - // parallelization at all or not. - bool shard_by_col = shardByCol(m, n, 2); - - // First approximation of kernel blocking sizes. - // Again, we don't know number of threads yet, so we use 2. - Index bm, bn, bk; - if (shard_by_col) { - internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, - internal::ShardByCol> - blocking(k, m, n, 2); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } else { - internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, - internal::ShardByRow> - blocking(k, m, n, 2); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } - - // Compute optimal number of threads. - // Note: we use bk instead of k here because we are interested in amount of - // _parallelizable_ computations, and computations are not parallelizable - // across k dimension. - const TensorOpCost cost = - contractionCost(m, n, bm, bn, bk, shard_by_col, false); - int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( - static_cast<double>(n) * m, cost, this->m_device.numThreads()); - int num_threads_by_k = numThreadsInnerDim(m, n, k); - if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) { - // We are in the scenario where it is more effective to shard by the - // inner dimension. - if (IsEvalInSyncMode) { - EvalShardedByInnerDimContext<DoneCallback> ctx( - this, num_threads_by_k, buffer, m, n, k, std::move(done)); - ctx.template run<Alignment>(); - } else { - auto* ctx = new EvalShardedByInnerDimContext<DoneCallback>( - this, num_threads_by_k, buffer, m, n, k, std::move(done)); - ctx->template runAsync<Alignment>(); - } - - return; - } - - // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost - // model is not tuned. Remove this when the cost model is tuned. 
- if (n == 1) num_threads = 1; - - if (num_threads == 1) { - TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, - Unaligned, (buffer)); - if (!IsEvalInSyncMode) done(); - return; - } - - // Now that we know number of threads, recalculate sharding and blocking. - shard_by_col = shardByCol(m, n, num_threads); - if (shard_by_col) { - internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, - internal::ShardByCol> - blocking(k, m, n, num_threads); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } else { - internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, - internal::ShardByRow> - blocking(k, m, n, num_threads); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } - - // Number of kernels for each dimension. - Index nm0 = divup(m, bm); - Index nn0 = divup(n, bn); - Index nk = divup(k, bk); - - // Calculate task grain size (number of kernels executed per task). - // This task size coarsening serves two purposes: - // 1. It reduces per-task overheads including synchronization overheads. - // 2. It allows to use caches better (reuse the same packed rhs in several - // consecutive kernels). - Index gm = 1; - Index gn = 1; - // If we are sharding by column, then we prefer to reduce rows first. - if (shard_by_col) { - gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); - gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); - } else { - gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); - gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); - } - // Number of tasks in each dimension. - Index nm = divup(nm0, gm); - Index nn = divup(nn0, gn); - - // If there is enough concurrency in the sharding dimension, we choose not - // to paralellize by the other dimension, and execute all kernels in sync - // mode. This reduces parallelism from the nm x nn down to nn - // (shard_by_col==true) or nm (shard_by_col==false). 
- const Index sharding_dim_tasks = shard_by_col ? nn : nm; - const int num_worker_threads = this->m_device.numThreadsInPool(); - - // With small number of threads we want to make sure that we do not reduce - // parallelism too much. With large number of threads we trade maximum - // parallelism for better memory locality. - const float oversharding_factor = - num_worker_threads <= 4 ? 8.0 : - num_worker_threads <= 8 ? 4.0 : - num_worker_threads <= 16 ? 2.0 : - num_worker_threads <= 32 ? 1.0 : - num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6; - - const bool parallelize_by_sharding_dim_only = - sharding_dim_tasks >= oversharding_factor * num_worker_threads; - - // Last by not least, decide whether we want to issue both lhs and rhs - // packing in parallel; or issue lhs packing first, and then issue rhs - // packing when lhs packing completes (for !shard_by_col lhs and rhs are - // swapped). Parallel packing allows more parallelism (for both packing and - // kernels), while sequential packing provides better locality (once - // a thread finishes rhs packing it proceed to kernels with that rhs). - // First, we are interested in parallel packing if there are few tasks. - bool parallel_pack = num_threads >= nm * nn; - // Also do parallel packing if all data fits into L2$. - if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= - l2CacheSize() * num_threads) - parallel_pack = true; - // But don't do it if we will use each rhs only once. Locality seems to be - // more important in this case. - if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; - // Also don't get in the way of parallelize_by_sharding_dim_only - // optimization. - if (parallelize_by_sharding_dim_only) parallel_pack = false; - - // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext. 
- if (IsEvalInSyncMode) { -#define CONTEXT_ARGS \ - (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ - nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \ - NoCallback()) \ - .run() - TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment, - CONTEXT_ARGS); -#undef CONTEXT_ARGS - - } else { -#define CONTEXT_ARGS \ - (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ - nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \ - std::move(done)) - TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, - Alignment, CONTEXT_ARGS, run()); -#undef CONTEXT_ARGS - } - } - - // ------------------------------------------------------------------------ // - - // Dummy struct to represent an empty DoneCallback. - - struct NoCallback { - void operator()() { - eigen_assert(false && "NoCallback should never be called"); - } - }; - - // ------------------------------------------------------------------------ // - - template <typename DoneCallback, typename Context> - class EvalParallelNotification; - - // Synchronous evaluation notification that blocks caller thread in Wait(). - template <typename Context> - class EvalParallelNotification<NoCallback, Context> { - public: - EvalParallelNotification(Context*, NoCallback) {} - void Notify() { done_.Notify(); } - void Wait() { done_.Wait(); } - private: - Eigen::Notification done_; - }; - - // Asynchronous evaluation notification that does not block in Wait(). - template <typename DoneCallback, typename Context> - class EvalParallelNotification { - public: - EvalParallelNotification(Context* ctx, DoneCallback done) - : ctx_(ctx), done_(std::move(done)) {} - - void Notify() { - // Make a copy of done callback, because it will be destructed when we - // will delete context in the next line (EvalParallelNotification is a - // data member of EvalParallelContext class). 
- DoneCallback done_copy = std::move(done_); - - // Delete parallel evaluation context. - delete ctx_; - - // Now safely call the done callback. - done_copy(); - } - - void Wait() {} - - private: - Context* ctx_; - DoneCallback done_; - }; - - // Context orchestrates sync/async parallel contraction evaluation. When it is - // executed in asynchronous mode, it owns all the shared state that might be - // accessible by block packing and kernel tasks. - - template <typename DoneCallback, bool lhs_inner_dim_contiguous, - bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, - int Alignment> - class EvalParallelContext { - public: - typedef internal::TensorContractionInputMapper< - LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, - contract_t, internal::packet_traits<LhsScalar>::size, - lhs_inner_dim_contiguous, false, Unaligned> - LhsMapper; - typedef internal::TensorContractionInputMapper< - RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, - contract_t, internal::packet_traits<RhsScalar>::size, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> - RhsMapper; - - typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; - - typedef internal::TensorContractionKernel< - Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> - TensorContractionKernel; - - typedef typename TensorContractionKernel::LhsBlock LhsBlock; - typedef typename TensorContractionKernel::RhsBlock RhsBlock; - typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; - - EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, - Index tm, Index tn, Index tk, Index bm, Index bn, - Index bk, Index nm, Index nn, Index nk, Index gm, - Index gn, Index nm0, Index nn0, bool shard_by_col, - bool parallel_pack, - bool parallelize_by_sharding_dim_only, - DoneCallback done) - : created_by_thread_id_(std::this_thread::get_id()), - done_(this, std::move(done)), - device_(self->m_device), - 
lhs_(self->m_leftImpl, self->m_left_nocontract_strides, - self->m_i_strides, self->m_left_contracting_strides, - self->m_k_strides), - rhs_(self->m_rightImpl, self->m_right_nocontract_strides, - self->m_j_strides, self->m_right_contracting_strides, - self->m_k_strides), - buffer_(buffer), - output_(buffer, tm), - output_kernel_(self->m_output_kernel), - tensor_contraction_params_(self->m_tensor_contraction_params), - num_threads_(num_threads), - shard_by_col_(shard_by_col), - parallel_pack_(parallel_pack), - parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), - m_(tm), - n_(tn), - k_(tk), - bm_(bm), - bn_(bn), - bk_(bk), - nm_(nm), - nn_(nn), - nk_(nk), - gm_(gm), - gn_(gn), - nm0_(nm0), - nn0_(nn0), - kernel_(m_, k_, n_, bm_, bk_, bn_), - num_thread_local_allocations_(0), - // We reserve 2X more capacity for a thread local values, than the - // number of threads in the pool to efficiently handle task stealing - // by threads that are not managed by the pool. - thread_local_capacity(2 * (parallelize_by_sharding_dim_only_ - ? device_.numThreadsInPool() - : 0)), - // We will use only one of the Lhs/Rhs thread local storage depending - // on the shard_by_col value and we parallelize by sharding dim ONLY. - lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity, - {*this}, {*this}), - rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0, - {*this}, {*this}) { - // These two options are mutually exclusive. - eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only)); - - for (Index x = 0; x < P; x++) { - // Normal number of notifications for k slice switch is - // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only - // nm_ + nn_ notifications, because they will not receive notifications - // from preceding kernels. - state_switch_[x] = - x == 0 - ? 1 - : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + - (x == P - 1 ? nm_ * nn_ : 0); - state_packing_ready_[x] = - parallel_pack_ ? 
0 : (shard_by_col_ ? nm_ : nn_); - state_kernel_[x] = new std::atomic<uint8_t>*[nm_]; - for (Index m = 0; m < nm_; m++) { - state_kernel_[x][m] = new std::atomic<uint8_t>[nn_]; - // Kernels generally receive 3 notifications (previous kernel + 2 - // packing), but the first slice won't get notifications from previous - // kernels. - for (Index n = 0; n < nn_; n++) - state_kernel_[x][m][n].store( - (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), - std::memory_order_relaxed); - } - } - - // Allocate memory for packed rhs/lhs matrices. - packed_mem_ = kernel_.allocateSlices( // - device_, // - /*num_lhs=*/nm0_, // - /*num_rhs=*/nn0_, // - /*num_slices=*/std::min<Index>(nk_, P - 1), // - packed_lhs_, packed_rhs_); - - if (parallelize_by_sharding_dim_only_) { - const int num_worker_threads = device_.numThreadsInPool(); - - if (shard_by_col) { - can_use_thread_local_packed_ = new std::atomic<bool>[nn_]; - for (int i = 0; i < nn_; ++i) - can_use_thread_local_packed_[i].store(true, - std::memory_order_relaxed); - - Index num_blocks = num_worker_threads * gn_; - thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // - device_, // - /*num_lhs=*/0, // - /*num_rhs=*/num_blocks, // - /*num_slices=*/1, // - /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_); - - } else { - can_use_thread_local_packed_ = new std::atomic<bool>[nm_]; - for (int i = 0; i < nm_; ++i) - can_use_thread_local_packed_[i].store(true, - std::memory_order_relaxed); - - Index num_blocks = num_worker_threads * gm_; - thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // - device_, // - /*num_lhs=*/num_blocks, // - /*num_rhs=*/0, // - /*num_slices=*/1, &lhs_thread_local_pre_allocated_, // - /*rhs_blocks=*/nullptr); - } - } - } - - ~EvalParallelContext() { - for (Index x = 0; x < P; x++) { - for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; - delete[] state_kernel_[x]; - } - kernel_.deallocate(device_, packed_mem_); - if (parallelize_by_sharding_dim_only_) { - 
kernel_.deallocate(device_, thread_local_pre_alocated_mem_); - delete[] can_use_thread_local_packed_; - } - } - - void run() { - // Kick off packing of the first slice. - signal_switch(0, 1); - - // Wait for overall completion. - // - // If parallel evaluation is executed in async mode, this is a no-op, and - // Wait() will return immediately. In synchronous mode it will block the - // caller thread until it will receive notification from last task. - // - // In async mode, last task when completed will call done callback from - // the same thread, and will delete this context. - // - // TODO(dvyukov): This wait can lead to deadlock if contraction is - // evaluated in synchronous mode. If nthreads contractions are - // concurrently submitted from worker threads, this wait will block all - // worker threads and the system will deadlock. - done_.Wait(); - } - - private: - std::thread::id created_by_thread_id_; - - // This notification is specialized on the type of DoneCallback and can be - // blocking or non-blocking. - EvalParallelNotification<DoneCallback, EvalParallelContext> done_; - - const Device& device_; - LhsMapper lhs_; - RhsMapper rhs_; - Scalar* const buffer_; - OutputMapper output_; - OutputKernelType output_kernel_; - TensorContractionParams tensor_contraction_params_; - const int num_threads_; - const bool shard_by_col_; - const bool parallel_pack_; - const bool parallelize_by_sharding_dim_only_; - // Matrix sizes. - const Index m_; - const Index n_; - const Index k_; - // Block sizes. - const Index bm_; - const Index bn_; - const Index bk_; - // Number of tasks. - const Index nm_; - const Index nn_; - const Index nk_; - // Task grain sizes (number of kernels executed per task). - const Index gm_; - const Index gn_; - // Number of blocks (this is different from ni_/nn_ because of task size - // coarsening). - const Index nm0_; - const Index nn0_; - // Tensor contraction kernel. - TensorContractionKernel kernel_; - - // Parallelization strategy. 
- // - // Blocks related to the same k block can run in parallel because they write - // to different output blocks. So we parallelize within k slices, this - // gives us parallelism level of m x n. Before we can start any kernels - // related to k-th slice, we need to issue m lhs packing tasks and n rhs - // packing tasks. - // - // However, there is a bottleneck when we are finishing kernels for k-th - // slice (at the very end there is only 1 runnable kernel). To mitigate this - // bottleneck we allow kernels from k-th and k+1-th slices to run in - // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same - // output block, so they must not run in parallel. - // - // This gives us the following dependency graph. - // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs - // packing tasks. - // Kernel (m, n, k) can start when: - // - kernel (m, n, k-1) has finished - // - lhs packing (m, k) has finished - // - rhs packing (n, k) has finished - // Lhs/rhs packing can start when: - // - all k-1 packing has finished (artificially imposed to limit amount of - // parallel packing) - // - // On top of that we limit runnable tasks to two consecutive k slices. - // This is done to limit amount of memory we need for packed lhs/rhs - // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). - // - // state_switch_ tracks when we are ready to switch to the next k slice. - // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). - // These variable are rolling over 3 consecutive k slices: first two we are - // actively executing + one to track completion of kernels in the second - // slice. - static const Index P = 3; - - // Handle to the allocated temporary storage for Lhs/Rhs blocks. 
- BlockMemHandle packed_mem_; - std::vector<LhsBlock> packed_lhs_[P - 1]; - std::vector<RhsBlock> packed_rhs_[P - 1]; - - // If we choose to parallelize only by the sharding dimension, each thread - // will have it's own "thead local" (not a c++ thread local storage) memory - // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory - // can't be passed to a kernel that might execute on a different thread. - // - // In practice when we are ready to pack memory for the sharding dimension - // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice - // already computed (99% of the time), and we can pack data into the thread - // local storage, and guarantee that all the kernels will be executed - // immediately in the same thread. This significantly increases L1 cache hit - // ratio and reduces pressure on the memory bus. - // - // It's still possible that kernel for the K-th slice will be ready before - // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_ - // and packed_rhs_ to allow kernels to be executed later on a thread - // different from the thread that was used for packing. - - // Handle for pre-allocated thread local memory buffers. - BlockMemHandle thread_local_pre_alocated_mem_; - - // Only one of these will be initialized depending on shard_by_col value - // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`). - std::vector<LhsBlock> lhs_thread_local_pre_allocated_; - std::vector<RhsBlock> rhs_thread_local_pre_allocated_; - - // How many thread local blocks were already allocated. - std::atomic<int> num_thread_local_allocations_; - const int thread_local_capacity; - - // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of - // unique threads in a system is below or equal to the number of threads in - // a thread pool. We will fallback on dynamic memory allocation after that. - - // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. 
Its - // size is equal to the grain size in Lhs/Rhs sharding dimension. - template <typename BlockType> - class ThreadLocalBlocks { - public: - ThreadLocalBlocks() = default; - - ThreadLocalBlocks(BlockType* base, size_t grain_size) - : is_pre_allocated_(true), - thread_local_pre_allocated_base_(base), - grain_size_(grain_size) {} - - ThreadLocalBlocks(BlockMemHandle mem_handle, - std::vector<BlockType> blocks) - : is_pre_allocated_(false), - mem_handle_(std::move(mem_handle)), - blocks_(std::move(blocks)) {} - - BlockType& block(int grain_index) { - eigen_assert(grain_index >= 0); - eigen_assert(static_cast<size_t>(grain_index) < size()); - return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] - : blocks_[grain_index]; - } - - void Release(EvalParallelContext& ctx) const { - if (!is_pre_allocated_) { - ctx.kernel_.deallocate(ctx.device_, mem_handle_); - } - } - - size_t size() const { - return is_pre_allocated_ ? grain_size_ : blocks_.size(); - } - - private: - bool is_pre_allocated_; - - // Reuse pre-allocated thread local buffers. - BlockType* thread_local_pre_allocated_base_ = nullptr; - size_t grain_size_ = 0; - - // These will be initialized only if `is_pre_allocated == false`. - BlockMemHandle mem_handle_{}; - std::vector<BlockType> blocks_; - }; - - // ThreadLocalBlocksInitialize callable does custom thread local blocks - // initialization, and will reuse pre-allocated buffers if possible, or will - // dynamically allocate new memory. - // - // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly - // for what side do we plan to do block allocation. 
- template <typename BlockType, bool is_rhs> - class ThreadLocalBlocksInitialize { - static constexpr bool kIsLhs = - !is_rhs && std::is_same<BlockType, LhsBlock>::value; - static const bool kIsRhs = - is_rhs && std::is_same<BlockType, RhsBlock>::value; - static_assert(kIsLhs || kIsRhs, "Unkown block type"); - - using Blocks = ThreadLocalBlocks<BlockType>; - - public: - ThreadLocalBlocksInitialize(EvalParallelContext& ctx) - : ctx_(ctx), - num_worker_threads_(ctx_.device_.numThreadsInPool()) {} - - void operator()(Blocks& blocks) { - const int n = ctx_.num_thread_local_allocations_.fetch_add( - 1, std::memory_order_relaxed); - - if (n >= num_worker_threads_) { - ThreadLocalBlocksAllocator<is_rhs>::allocate(ctx_, blocks); - } else { - ThreadLocalBlocksAllocator<is_rhs>::reuse(ctx_, n, blocks); - } - } - - private: - // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to - // TensorContractionKernel::allocateSlices into template specializations. - // Also explicit specializations are not allowed at class scope in C++03, - // EvalCtx type parameter is just a workaround for that limitation. 
- template <bool pack_rhs, typename EvalCtx = EvalParallelContext> - struct ThreadLocalBlocksAllocator; - - template <typename EvalCtx> - struct ThreadLocalBlocksAllocator</*pack_rhs=*/true, EvalCtx> { - static void allocate(EvalCtx& ctx, Blocks& blocks) { - std::vector<RhsBlock> rhs_blocks; - BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( - ctx.device_, - /*num_lhs=*/0, - /*num_rhs=*/ctx.gn_, - /*num_slices=*/1, - /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks); - - blocks = ThreadLocalBlocks<RhsBlock>(std::move(mem_handle), - std::move(rhs_blocks)); - } - - static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { - RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index]; - blocks = ThreadLocalBlocks<RhsBlock>(ptr, ctx.gn_); - } - }; - - template <typename EvalCtx> - struct ThreadLocalBlocksAllocator</*pack_rhs=*/false, EvalCtx> { - static void allocate(EvalCtx& ctx, Blocks& blocks) { - std::vector<LhsBlock> lhs_blocks; - BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( - ctx.device_, - /*num_lhs=*/ctx.gm_, - /*num_rhs=*/0, - /*num_slices=*/1, - /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr); - - blocks = ThreadLocalBlocks<LhsBlock>(std::move(mem_handle), - std::move(lhs_blocks)); - } - - static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { - LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index]; - blocks = ThreadLocalBlocks<LhsBlock>(ptr, ctx.gm_); - } - }; - - EvalParallelContext& ctx_; - const int num_worker_threads_; - }; - - template <typename BlockType> - class ThreadLocalBlocksRelease { - public: - using Blocks = ThreadLocalBlocks<BlockType>; - ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {} - void operator()(Blocks& blocks) { blocks.Release(ctx_); } - - private: - EvalParallelContext& ctx_; - }; - - // ThreadLocalBlocks initialization callables. 
- using ThreadLocalLhsInit = - ThreadLocalBlocksInitialize<LhsBlock, /*is_rhs=*/false>; - using ThreadLocalRhsInit = - ThreadLocalBlocksInitialize<RhsBlock, /*is_rhs=*/true>; - - // ThreadLocalBlocks release callables. - using ThreadLocalLhsRelease = ThreadLocalBlocksRelease<LhsBlock>; - using ThreadLocalRhsRelease = ThreadLocalBlocksRelease<RhsBlock>; - - // Thread local containers for Lhs/Rhs block packs. In practice only one of - // them will be used, depending on the shard_by_col value. - Eigen::ThreadLocal<ThreadLocalBlocks<LhsBlock>, ThreadLocalLhsInit, - ThreadLocalLhsRelease> - lhs_thread_local_blocks_; - Eigen::ThreadLocal<ThreadLocalBlocks<RhsBlock>, ThreadLocalRhsInit, - ThreadLocalRhsRelease> - rhs_thread_local_blocks_; - - // After a particular shard for Kth slice missed thread local execution - // opportunity (K-1 slice didn't complete kernels execution), we can no - // longer schedule K+1 and following slices in thread local mode, because - // there is no more guarantee that previous kernels were executed - // sequentially in the same thread (size is nn_ or nm_). - std::atomic<bool>* can_use_thread_local_packed_; - - std::atomic<uint8_t>** state_kernel_[P]; - // state_switch_ is frequently modified by worker threads, while other - // fields are read-only after constructor. Let's move it to a separate cache - // line to reduce cache-coherency traffic. - char pad_[128]; - std::atomic<Index> state_packing_ready_[P]; - std::atomic<Index> state_switch_[P]; - - LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { - if (use_thread_local) { - eigen_assert(!shard_by_col_); - ThreadLocalBlocks<LhsBlock>& blocks = lhs_thread_local_blocks_.local(); - - Index grain_index = m1 - m * gm_; - return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? 
- } else { - return packed_lhs_[k % (P - 1)][m1]; - } - } - - RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { - if (use_thread_local) { - eigen_assert(shard_by_col_); - ThreadLocalBlocks<RhsBlock>& blocks = rhs_thread_local_blocks_.local(); - - Index grain_index = n1 - n * gn_; - return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? - } else { - return packed_rhs_[k % (P - 1)][n1]; - } - } - - // In following two methods (pack_lhs and pack_rhs), if we know for sure - // that we'll be able to immediately call a kernel with packed data, and do - // not submit it to the thread pool, we can use thread local memory for - // packed data. - // - // We can only reliably check it if we are running all kernels in sync mode - // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to - // run, it's guaranteed that all kernels with larger values of m (n) are - // also ready, because we execute them in the same order for all K slices. - - void pack_lhs(Index m, Index k) { - bool use_thread_local = false; - - if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && - can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { - if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { - use_thread_local = true; - } else { - // If we can't guarantee that all kernels in `k` slice will be - // executed sequentially in current thread, it's no longer safe to use - // thread local memory in following slices along the k dimensions. 
- eigen_assert(k > 0); - can_use_thread_local_packed_[m].store(false, - std::memory_order_relaxed); - } - } - - const Index mend = m * gm_ + gm(m); - for (Index m1 = m * gm_; m1 < mend; m1++) - kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), - lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); - - if (!parallel_pack_ && shard_by_col_) { - assert(!use_thread_local); - signal_packing(k); - } else { - signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) { - bool sync = parallelize_by_sharding_dim_only_ || n == 0; - signal_kernel(m, n, k, sync, use_thread_local); - } - } - } - - void pack_rhs(Index n, Index k) { - bool use_thread_local = false; - - if (parallelize_by_sharding_dim_only_ && shard_by_col_ && - can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { - if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { - use_thread_local = true; - } else { - // If we can't guarantee that all kernels in `k` slice will be - // executed sequentially in current thread, it's no longer safe to use - // thread local memory in followig slices along the k dimensions. - eigen_assert(k > 0); - can_use_thread_local_packed_[n].store(false, - std::memory_order_relaxed); - } - } - - const Index nend = n * gn_ + gn(n); - for (Index n1 = n * gn_; n1 < nend; n1++) { - if (!TensorContractionKernel::HasBeta && k == 0) { - // Zero the output memory in parallel, only if contraction kernel does - // not support `beta`. Otherwise we will pass beta 0.0 to the first - // call to the `TensorContractionKernel::invoke()`. - // - // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn - // x m) row. Safe to do here because all kernels that will write to - // this memory depend on completion of this task. Note: don't call - // device_.memset() here. device_.memset() blocks on thread pool - // worker thread, which can lead to underutilization and deadlocks. 
- memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); - } - kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), - rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); - } - - if (parallel_pack_ || shard_by_col_) { - signal_switch(k + 1); - for (Index m = nm_ - 1; m >= 0; m--) { - bool sync = parallelize_by_sharding_dim_only_ || m == 0; - signal_kernel(m, n, k, sync, use_thread_local); - } - } else { - assert(!use_thread_local); - signal_packing(k); - } - } - - void kernel(Index m, Index n, Index k, bool use_thread_local) { - // Note: order of iteration matters here. Iteration over m is innermost - // because we want to reuse the same packed rhs in consecutive tasks - // (rhs fits into L2$ while lhs only into L3$). - const Index nend = n * gn_ + gn(n); - const Index mend = m * gm_ + gm(m); - - // NOTE: output = alpha * LHS * RHS + beta * output. - const Scalar alpha = Scalar(1); - const Scalar beta = - (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1); - - if (shard_by_col_) { - for (Index n1 = n * gn_; n1 < nend; n1++) { - for (Index m1 = m * gm_; m1 < mend; m1++) { - const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); - kernel_.invoke( - output_mapper, - packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), - packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), - bk(k), bn(n1), alpha, beta); - - // We are done with the last task for the [m1, n1] block. 
- if (k + 1 == nk_) { - output_kernel_(output_mapper, tensor_contraction_params_, - m1 * bm_, n1 * bn_, bm(m1), bn(n1)); - } - } - } - } else { - for (Index m1 = m * gm_; m1 < mend; m1++) - for (Index n1 = n * gn_; n1 < nend; n1++) { - const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); - kernel_.invoke( - output_mapper, - packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), - packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), - bk(k), bn(n1), alpha, beta); - - // We are done with the last task for the [m1, n1] block. - if (k + 1 == nk_) { - output_kernel_(output_mapper, tensor_contraction_params_, - m1 * bm_, n1 * bn_, bm(m1), bn(n1)); - } - } - } - signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); - signal_switch(k + 2); - } - - void signal_packing(Index k) { - eigen_assert(!parallel_pack_); - Index s = state_packing_ready_[k % P].fetch_sub(1); - eigen_assert(s > 0); - if (s != 1) return; - state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; - enqueue_packing(k, shard_by_col_); - } - - void signal_kernel(Index m, Index n, Index k, bool sync, - bool use_thread_local) { - std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n]; - Index s = state->load(); - eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) { - eigen_assert(!use_thread_local); - return; - } - state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); - if (sync) { - kernel(m, n, k, use_thread_local); - } else { - eigen_assert(!use_thread_local); - device_.enqueueNoNotification( - [=]() { kernel(m, n, k, use_thread_local); }); - } - } - - void signal_switch(Index k, Index v = 1) { - Index s = state_switch_[k % P].fetch_sub(v); - eigen_assert(s >= v); - if (s != v) return; - - // Ready to switch to the next k slice. - // Reset counter for the next iteration. - state_switch_[k % P] = - (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + - nm_ * nn_; - if (k < nk_) { - // Issue lhs/rhs packing. 
Their completion will in turn kick off - // kernels. - if (parallel_pack_) { - enqueue_packing(k, !shard_by_col_); - enqueue_packing(k, shard_by_col_); - } else if (shard_by_col_) { - enqueue_packing(k, false); - } else { - enqueue_packing(k, true); - } - - // Termination handling. - // Because kernel completion signals k + 2 switch, we need to finish nk - // + 2 slices without issuing any tasks on nk + 1 slice. So here we - // pretend that all nk + 1 packing tasks just finish instantly; so that - // nk + 2 switch only waits for completion of nk kernels. - } else if (k == nk_) { - signal_switch(k + 1, - parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); - } else { - done_.Notify(); - } - } - - // Enqueue all rhs/lhs packing for k-th slice. - void enqueue_packing(Index k, bool rhs) { - enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); - } - - void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { - if (end - start == 1) { - if (rhs) - pack_rhs(start, k); - else - pack_lhs(start, k); - } else { - while (end - start > 1) { - Index mid = (start + end) / 2; - device_.enqueueNoNotification( - [=]() { enqueue_packing_helper(mid, end, k, rhs); }); - end = mid; - } - - // Decide if we want to run first packing task (start == 0) in - // async mode if we parallelize only by sharding dim: - // (1) pack_lhs and pack_rhs call signal_switch before completing - // all calls to signal_kernel, which in sync mode might lead - // to the execution of the first kernel of the k+1 slice, before - // completing a call to the last kernel of the k slice. - // (2) all pack tasks for sharded dim must be executed in a thread - // pool to get pre-allocated thead local buffers. 
- bool pack_async = - (start == 0) && - (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) && - (k > 0 || std::this_thread::get_id() == created_by_thread_id_); - - if (pack_async) { - device_.enqueueNoNotification( - [=]() { enqueue_packing_helper(start, end, k, rhs); }); - } else { - enqueue_packing_helper(start, end, k, rhs); - } - } - } - - // Block sizes with accounting for potentially incomplete last block. - Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } - Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } - Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } - // Task grain sizes accounting for potentially incomplete last task. - Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } - Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } - - EvalParallelContext(const EvalParallelContext&) = delete; - void operator=(const EvalParallelContext&) = delete; - }; - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, - bool rhs_inner_dim_reordered, int Alignment> - using SyncEvalParallelContext = - EvalParallelContext<NoCallback, lhs_inner_dim_contiguous, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, - Alignment>; - - // ------------------------------------------------------------------------ // - - // EvalShardedByInnerDimContext orchestrates sync/async contraction - // evaluation, when we shard by inner dimension. When it is executed in - // asynchronous mode, it owns all the shared state that might be accessible by - // block processing tasks. 
- - template <typename DoneCallback> - struct EvalShardedByInnerDimContext { - EvalShardedByInnerDimContext(const Self* self, int num_threads, - Scalar* result_buffer, - Index m_size, Index n_size, Index k_size, - DoneCallback done_callback) - : evaluator(self), - m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), - m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), - m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), - result(result_buffer), - m(m_size), - n(n_size), - k(k_size), - done(std::move(done_callback)), - buffer_size_bytes(m * n * sizeof(Scalar)), - block_size(blockSize(k, num_threads)), - num_blocks(divup<Index>(k, block_size)), - num_pending_blocks(internal::convert_index<int>(num_blocks)), - l0_ranges(divup<Index>(num_blocks, l0_size)), - l0_state(l0_ranges), - block_buffers(num_blocks) { - // Keep count of pending gemm tasks for each l0 range. - for (int i = 0; i < l0_ranges; ++i) { - const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i); - l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks)); - } - - // Allocate temporary buffers for each block. - for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { - Scalar* buf = block_idx == 0 - ? result - : static_cast<Scalar*>(evaluator->m_device.allocate( - buffer_size_bytes)); - block_buffers.emplace_back(buf); - } - } - - ~EvalShardedByInnerDimContext() { - for (Index i = 1; i < num_blocks; ++i) { - evaluator->m_device.deallocate(block_buffers[i]); - } - } - - template <int Alignment> - void run() { - Barrier barrier(internal::convert_index<int>(num_blocks)); - eval<Alignment>(barrier, 0, num_blocks); - barrier.Wait(); - - // Aggregate partial sums from l0 ranges. - aggregateL0Blocks<Alignment>(); - - // Apply output kernel. 
- applyOutputKernel(); - } - - template <int Alignment> - void runAsync() { - evalAsync<Alignment>(0, num_blocks); - } - - private: - // The underlying GEMM kernel assumes that k is a multiple of - // the packet size and subtle breakage occurs if this is violated. - static const Index packet_size = internal::packet_traits<RhsScalar>::size; - - const Self* evaluator; // TensorContraction evaluator - - // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. - bool m_lhs_inner_dim_contiguous; - bool m_rhs_inner_dim_contiguous; - bool m_rhs_inner_dim_reordered; - - Scalar* result; - - Index m; - Index n; - Index k; - - DoneCallback done; - - // ----------------------------------------------------------------------// - // Algorithm parameters. - - // We will compute partial results into the buffers of this size. - Index buffer_size_bytes; - - Index block_size; - Index num_blocks; - - // Keep track of pending tasks when evaluate in async mode. - std::atomic<int> num_pending_blocks; - - // We compute partial gemm results in parallel, and to get the final result - // we need to add them all together. For the large number of threads (>= 48) - // this adds a very expensive sequential step at the end. - // - // We split the [0, num_blocks) into small ranges, and when a task for the - // block finishes its partial gemm computation, it checks if it was the last - // gemm in the range, and if so, it will add all blocks of the range. - // - // After all tasks done, we need to add only these pre-aggregated blocks. - - // For now we use just a single level of ranges to compute pre-aggregated - // partial sums, but in general we can use more layers to compute tree - // aggregation in parallel and reduce the size of the sequential step. - // - // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make - // sense only if number of threads >= ~128? - static const Index l0_size = 4; - Index l0_ranges; - - // Keep count of pending gemm tasks for each l0 range. 
- MaxSizeVector<std::atomic<int>> l0_state; // [0, l0_ranges) - - // Buffers allocated for each temporary block computation. - MaxSizeVector<Scalar*> block_buffers; // [0, num_blocks) - - template <int Alignment> - void processBlock(Index block_idx, Index begin, Index end) { - Scalar* buf = block_buffers[block_idx]; - - TENSOR_CONTRACTION_DISPATCH( - evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, - /*num_threads=*/internal::convert_index<int>(num_blocks))); - - // Check if it was the last task in l0 range. - const Index l0_index = block_idx / l0_size; - const int v = l0_state[l0_index].fetch_sub(1); - eigen_assert(v >= 1); - - // If we processed the last block of the range, we can aggregate all - // partial results into the first block of the range. - if (v == 1) { - const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); - const Index dst_block_idx = l0_index * l0_size; - - if (rng_size == l0_size) { - addAllToBuffer<Alignment>( - m * n, - /*src_buf0=*/block_buffers[dst_block_idx + 1], - /*src_buf1=*/block_buffers[dst_block_idx + 2], - /*src_buf2=*/block_buffers[dst_block_idx + 3], - /*dst_buf= */ block_buffers[dst_block_idx]); - } else { - // Aggregate blocks of potentially incomplete last range. - for (int i = 1; i < rng_size; ++i) { - addToBuffer<Alignment>(m * n, - /*src_buf=*/block_buffers[dst_block_idx + i], - /*dst_buf=*/block_buffers[dst_block_idx]); - } - } - } - } - - // Aggregate partial sums from l0 ranges. 
- template <int Alignment> - void aggregateL0Blocks() const { - Index l0_index = 1; - - for (; l0_index + 2 < l0_ranges; l0_index += 3) { - addAllToBuffer<Alignment>( - m * n, - /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], - /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], - /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], - /*dst_buf= */ block_buffers[0]); - } - - for (; l0_index < l0_ranges; ++l0_index) { - addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size], - block_buffers[0]); - } - } - - void applyOutputKernel() const { - typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; - evaluator->m_output_kernel( - OutputMapper(result, m), evaluator->m_tensor_contraction_params, - static_cast<Eigen::Index>(0), static_cast<Eigen::Index>(0), m, n); - } - - // Compute block size with accounting for potentially incomplete last block. - Index actualBlockSize(Index block_idx) const { - return block_idx + 1 < num_blocks - ? block_size - : k + block_size - block_size * num_blocks; - }; - - // Compute range size with accounting for potentially incomplete last range. - Index actualRangeSize(Index num_ranges, Index range_size, - Index range_idx) const { - eigen_assert(range_idx < num_ranges); - return range_idx + 1 < num_ranges - ? 
range_size - : num_blocks + range_size - range_size * num_ranges; - }; - - template <int Alignment> - EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, - Scalar* tgt_buf) { - const int output_packet_size = - internal::unpacket_traits<PacketReturnType>::size; - size_t i = 0; - const size_t num_packets = n / output_packet_size; - for (; i < output_packet_size * num_packets; i += output_packet_size) { - const PacketReturnType src_val = - internal::pload<PacketReturnType>(src_buf + i); - const PacketReturnType tgt_val = - internal::ploadt<PacketReturnType, Alignment>(tgt_buf + i); - const PacketReturnType sum = internal::padd(src_val, tgt_val); - internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i, - sum); - } - for (; i < n; ++i) { - tgt_buf[i] += src_buf[i]; - } - } - - template <int Alignment> - EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, - const Scalar* src_buf0, - const Scalar* src_buf1, - const Scalar* src_buf2, - Scalar* dst_buf) { - using ::Eigen::internal::padd; - using ::Eigen::internal::pload; - using ::Eigen::internal::ploadt; - using ::Eigen::internal::pstoret; - - const int output_packet_size = - internal::unpacket_traits<PacketReturnType>::size; - - size_t i = 0; - const size_t num_packets = n / output_packet_size; - for (; i < output_packet_size * num_packets; i += output_packet_size) { - const auto src_val0 = pload<PacketReturnType>(src_buf0 + i); - const auto src_val1 = pload<PacketReturnType>(src_buf1 + i); - const auto src_val2 = pload<PacketReturnType>(src_buf2 + i); - - const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i); - const auto sum = - padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); - - pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum); - } - for (; i < n; ++i) { - dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; - } - } - - template <int Alignment> - void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) { - while 
(end_block_idx - start_block_idx > 1) { - Index mid_block_idx = (start_block_idx + end_block_idx) / 2; - evaluator->m_device.enqueueNoNotification( - [this, &barrier, mid_block_idx, end_block_idx]() { - eval<Alignment>(barrier, mid_block_idx, end_block_idx); - }); - end_block_idx = mid_block_idx; - } - - Index block_idx = start_block_idx; - Index block_start = block_idx * block_size; - Index block_end = block_start + actualBlockSize(block_idx); - - processBlock<Alignment>(block_idx, block_start, block_end); - barrier.Notify(); - } - - template <int Alignment> - void evalAsync(Index start_block_idx, Index end_block_idx) { - while (end_block_idx - start_block_idx > 1) { - Index mid_block_idx = (start_block_idx + end_block_idx) / 2; - evaluator->m_device.enqueueNoNotification( - [this, mid_block_idx, end_block_idx]() { - evalAsync<Alignment>(mid_block_idx, end_block_idx); - }); - end_block_idx = mid_block_idx; - } - - Index block_idx = start_block_idx; - - Index block_start = block_idx * block_size; - Index block_end = block_start + actualBlockSize(block_idx); - - processBlock<Alignment>(block_idx, block_start, block_end); - - int v = num_pending_blocks.fetch_sub(1); - eigen_assert(v >= 1); - - if (v == 1) { - // Aggregate partial sums from l0 ranges. - aggregateL0Blocks<Alignment>(); - - // Apply output kernel. - applyOutputKernel(); - - // NOTE: If we call `done` callback before deleting this (context), - // it might deallocate Self* pointer captured by context, and we'll - // fail in destructor trying to deallocate temporary buffers. - - // Move done call back from context before it will be destructed. - DoneCallback done_copy = std::move(done); - - // We are confident that we are the last one who touches context. - delete this; - - // Now safely call the done callback. 
- done_copy(); - } - } - - // Cost model doesn't capture well the cost associated with constructing - // tensor contraction mappers and computing loop bounds in gemm_pack_lhs - // and gemm_pack_rhs, so we specify minimum desired block size. - static Index blockSize(Index k, int num_threads) { - const auto round_up = [=](Index index) -> Index { - const Index kmultiple = packet_size <= 8 ? 8 : packet_size; - return divup<Index>(index, kmultiple) * kmultiple; - }; - - const Index target_block_size = round_up(divup<Index>(k, num_threads)); - const Index desired_min_block_size = 12 * packet_size; - - return numext::mini<Index>( - k, numext::maxi<Index>(desired_min_block_size, target_block_size)); - } - - EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete; - void operator=(const EvalShardedByInnerDimContext&) = delete; - }; - - // ------------------------------------------------------------------------ // - - // Below are the function used by evalProductImpl heuristics, trying to select - // optimcal parameters for parallelization algorithm. - - // Decide whether we want to shard m x n contraction by columns or by rows. - static bool shardByCol(Index m, Index n, Index num_threads) { - // Note: we are comparing both n and m against Traits::nr, it is not - // a mistake. We are trying to figure out how both n and m will fit into - // the main sharding dimension. - - // Sharding by column is the default - // ... unless there is enough data for vectorization over rows - if (m / num_threads >= Traits::nr && - // and not enough data for vectorization over columns - (n / num_threads < Traits::nr || - // ... or barely enough data for vectorization over columns, - // but it is not evenly dividable across threads - (n / num_threads < 4 * Traits::nr && - (n % (num_threads * Traits::nr)) != 0 && - // ... and it is evenly dividable across threads for rows - ((m % (num_threads * Traits::nr)) == 0 || - // .. 
or it is not evenly dividable for both dimensions but - // there is much more data over rows so that corner effects are - // mitigated. - (m / n >= 6))))) - return false; - // Wait, or if matrices are just substantially prolonged over the other - // dimension. - if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; - return true; - } - - Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, - int num_threads, bool shard_by_col) const { - Index gm = 1; - Index gm1 = 1; - Index nm0 = divup(m, bm); - Index nm1 = nm0; - for (;;) { - // Find the next candidate for m grain size. It needs to result in - // different number of blocks. E.g. if we have 10 kernels, we want to try - // 5 and 10, but not 6, 7, 8 and 9. - while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; - if (gm1 > nm0) break; - // Check the candidate. - int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, - shard_by_col); - if (res < 0) break; - nm1 = divup(nm0, gm1); - if (res == 0) continue; - // Commit new grain size. - gm = gm1; - } - return gm; - } - - Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, - int num_threads, bool shard_by_col) const { - Index gn = 1; - Index gn1 = 1; - Index nn0 = divup(n, bn); - Index nn1 = nn0; - for (;;) { - while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; - if (gn1 > nn0) break; - int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, - shard_by_col); - if (res < 0) break; - nn1 = divup(nn0, gn1); - if (res == 0) continue; - gn = gn1; - } - return gn; - } - - // checkGrain checks whether grain (gm, gn) is suitable and is better than - // (oldgm, oldgn). 
- int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, - Index gn, Index oldgm, Index oldgn, int num_threads, - bool shard_by_col) const { - const TensorOpCost cost = - contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); - double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize( - static_cast<double>(bm) * gm * bn * gn, cost); - // If the task is too small, then we agree on it regardless of anything - // else. Otherwise synchronization overheads will dominate. - if (taskSize < 1) return 1; - // If it is too large, then we reject it and all larger tasks. - if (taskSize > 2) return -1; - // Now we are in presumably good task size range. - // The main deciding factor here is parallelism. Consider that we have 12 - // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. - // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 - // of cores will be busy). While grain size 3 gives us 4 tasks, which gives - // us parallelism of 1 (we can load all cores). - Index nm0 = divup(m, bm); - Index nn0 = divup(n, bn); - Index new_tasks = divup(nm0, gm) * divup(nn0, gn); - double new_parallelism = static_cast<double>(new_tasks) / - (divup<int>(new_tasks, num_threads) * num_threads); - Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); - double old_parallelism = static_cast<double>(old_tasks) / - (divup<int>(old_tasks, num_threads) * num_threads); - if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; - return 0; - } - - TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, - bool shard_by_col, bool prepacked) const { - const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size, - PacketType<RhsScalar, Device>::size); - const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size; - const double kd = static_cast<double>(bk); - double compute_bandwidth = computeBandwidth(false, bm, bn, bk); - // Computations. 
- TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size); - // Output stores. - cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); - if (prepacked) { - // Packing and kernels are executed in different tasks. When we calculate - // task grain size we look only at kernel cost assuming that kernel - // is more expensive than packing. - return cost; - } - // Lhs/rhs loads + computations. - TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); - TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); - // Lhs packing memory cost does not contribute considerably to overall - // execution time because lhs is prefetched early and accessed sequentially. - if (shard_by_col) - lhsCost.dropMemoryCost(); - else - rhsCost.dropMemoryCost(); - return cost + lhsCost + rhsCost; - } - - // Decide whether we want to shard m x k x n contraction over the inner - // (contraction) dimension (k). - static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, - int num_threads_by_k) { - std::ptrdiff_t bufsize = m * n * sizeof(Scalar); - bool shard_by_k = false; - if (n == 1 || // If mat*vec or... - num_threads_by_k < 2 || // running single threaded or... - num_threads_by_k < - num_threads || // sharding by k gives less parallelism or... - bufsize > l3CacheSize() / num_threads_by_k || // need more buffer space - // than L3 cache or... - k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny. - shard_by_k = false; - } else if (numext::maxi(m, n) / num_threads < - Traits::nr || // both other dimensions are tiny or... - // k per thread is not small and... - (k / num_threads_by_k > 8 * Traits::nr && - // one of the outer dimensions is tiny or sharding by k offers - // more parallelism. 
- (numext::mini(m, n) < 2 * Traits::nr || - num_threads_by_k > num_threads))) { - shard_by_k = true; - } - return shard_by_k; - } - - TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { - // Compute cost. - const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size; - TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size); - // Output stores. - cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); - TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m; - TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n; - // Since the inner gemm kernel is always sharded by column, the lhs - // load cost is negligible. - lhsCost.dropMemoryCost(); - return cost + lhsCost + rhsCost; - } - - int numThreadsInnerDim(Index m, Index n, Index k) const { - const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size; - TensorOpCost cost = contractionCostPerInnerDim(m, n, k); - double total_parallel_cost = - TensorCostModel<ThreadPoolDevice>::totalCost(k, cost); - // Cost of reduction step accumulating the m*n per-thread buffers into the - // result. - double reduction_cost = TensorCostModel<ThreadPoolDevice>::totalCost( - m * n, TensorOpCost(2, 1, 1, true, output_packet_size)); - int num_threads = 1; - double min_cost = total_parallel_cost; - double kPerThreadOverHead = 3000; - double kFixedOverHead = 100000; - for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { - double sequential_cost = - kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); - double parallel_cost = total_parallel_cost / nt + sequential_cost; - if (parallel_cost < min_cost) { - num_threads = nt; - min_cost = parallel_cost; - } - } - return num_threads; - } - - double computeBandwidth(bool shard_by_col, Index bm, Index bn, - Index bk) const { - // Peak VFMA bandwidth is 0.5. However if we have not enough data for - // vectorization bandwidth drops. 
The 4.0 and 2.0 bandwidth is determined - // experimentally. - double computeBandwidth = - bk == 1 ? 4.0 - : (shard_by_col ? bn : bm) < Traits::nr || - (shard_by_col ? bm : bn) < Traits::mr - ? 2.0 - : 0.5; -#ifndef EIGEN_VECTORIZE_FMA - // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. - // However for MULPS/ADDPS we have dependent sequence of 2 such - // instructions, - // so overall bandwidth is 1.0. - if (computeBandwidth == 0.5) computeBandwidth = 1.0; -#endif - return computeBandwidth; - } - -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_THREADS -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h deleted file mode 100644 index 09d2da9..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h +++ /dev/null @@ -1,456 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H - -namespace Eigen { - -/** \class TensorConversionOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor conversion class. This class makes it possible to vectorize - * type casting operations when the number of scalars per packet in the source - * and the destination type differ - */ -namespace internal { -template<typename TargetType, typename XprType> -struct traits<TensorConversionOp<TargetType, XprType> > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. 
- typedef TargetType Scalar; - typedef typename traits<XprType>::StorageKind StorageKind; - typedef typename traits<XprType>::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = traits<XprType>::NumDimensions; - static const int Layout = traits<XprType>::Layout; - enum { Flags = 0 }; - typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType; -}; - -template<typename TargetType, typename XprType> -struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense> -{ - typedef const TensorConversionOp<TargetType, XprType>& type; -}; - -template<typename TargetType, typename XprType> -struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type> -{ - typedef TensorConversionOp<TargetType, XprType> type; -}; - -} // end namespace internal - - -template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio> -struct PacketConverter; - -template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> -struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 1> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template<int LoadMode, typename Index> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index)); - } - - private: - const TensorEvaluator& m_impl; -}; - - -template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> -struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template<int LoadMode, typename Index> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int 
SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; - - SrcPacket src1 = m_impl.template packet<LoadMode>(index); - SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize); - TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> -struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template<int LoadMode, typename Index> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; - - SrcPacket src1 = m_impl.template packet<LoadMode>(index); - SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize); - SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize); - SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize); - TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket> -struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 8, 1> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template<int LoadMode, typename Index> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; - - SrcPacket src1 = m_impl.template packet<LoadMode>(index); - SrcPacket src2 = m_impl.template packet<LoadMode>(index + 1 * SrcPacketSize); - SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize); - SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize); - 
SrcPacket src5 = m_impl.template packet<LoadMode>(index + 4 * SrcPacketSize); - SrcPacket src6 = m_impl.template packet<LoadMode>(index + 5 * SrcPacketSize); - SrcPacket src7 = m_impl.template packet<LoadMode>(index + 6 * SrcPacketSize); - SrcPacket src8 = m_impl.template packet<LoadMode>(index + 7 * SrcPacketSize); - TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4, src5, src6, src7, src8); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int TgtCoeffRatio> -struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, TgtCoeffRatio> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} - - template<int LoadMode, typename Index> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size; - // Only call m_impl.packet() when we have direct access to the underlying data. This - // ensures that we don't compute the subexpression twice. We may however load some - // coefficients twice, but in practice this doesn't negatively impact performance. 
- if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { - // Force unaligned memory loads since we can't ensure alignment anymore - return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index)); - } else { - const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size; - typedef typename internal::unpacket_traits<SrcPacket>::type SrcType; - typedef typename internal::unpacket_traits<TgtPacket>::type TgtType; - internal::scalar_cast_op<SrcType, TgtType> converter; - EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < TgtPacketSize; ++i) { - values[i] = converter(m_impl.coeff(index+i)); - } - TgtPacket rslt = internal::pload<TgtPacket>(values); - return rslt; - } - } - - private: - const TensorEvaluator& m_impl; - const typename TensorEvaluator::Index m_maxIndex; -}; - -template<typename TargetType, typename XprType> -class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits<TensorConversionOp>::Scalar Scalar; - typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind; - typedef typename internal::traits<TensorConversionOp>::Index Index; - typedef typename internal::nested<TensorConversionOp>::type Nested; - typedef Scalar CoeffReturnType; - typedef typename NumTraits<Scalar>::Real RealScalar; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) - : m_xpr(xpr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - -template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval { - static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { - impl.evalSubExprsIfNeeded(NULL); - return true; - } -}; - -template <typename Eval, typename 
EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> { - static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { - return impl.evalSubExprsIfNeeded(data); - } -}; - -#ifdef EIGEN_USE_THREADS -template <bool SameType, typename Eval, typename EvalPointerType, - typename EvalSubExprsCallback> -struct ConversionSubExprEvalAsync { - static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { - impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); - } -}; - -template <typename Eval, typename EvalPointerType, - typename EvalSubExprsCallback> -struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType, - EvalSubExprsCallback> { - static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { - impl.evalSubExprsIfNeededAsync(data, std::move(done)); - } -}; -#endif - -namespace internal { - -template <typename SrcType, typename TargetType, bool IsSameT> -struct CoeffConv { - template <typename ArgType, typename Device> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { - internal::scalar_cast_op<SrcType, TargetType> converter; - return converter(impl.coeff(index)); - } -}; - -template <typename SrcType, typename TargetType> -struct CoeffConv<SrcType, TargetType, true> { - template <typename ArgType, typename Device> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { - return impl.coeff(index); - } -}; - -template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT> -struct PacketConv { - typedef typename internal::unpacket_traits<SrcPacket>::type SrcType; - typedef typename internal::unpacket_traits<TargetPacket>::type TargetType; - - static const int PacketSize = internal::unpacket_traits<TargetPacket>::size; - - template <typename ArgType, typename Device> - static 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { - internal::scalar_cast_op<SrcType, TargetType> converter; - EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - TargetPacket rslt = internal::pload<TargetPacket>(values); - return rslt; - } -}; - -template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT> -struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> { - typedef typename internal::unpacket_traits<SrcPacket>::type SrcType; - typedef typename internal::unpacket_traits<TargetPacket>::type TargetType; - - template <typename ArgType, typename Device> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; - PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet<LoadMode>(index); - } -}; - -template <typename SrcPacket, typename TargetPacket, int LoadMode> -struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> { - typedef typename internal::unpacket_traits<TargetPacket>::type TargetType; - static const int PacketSize = internal::unpacket_traits<TargetPacket>::size; - - template <typename ArgType, typename Device> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { - EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i); - return 
internal::pload<TargetPacket>(values); - } -}; - -template <typename SrcPacket, typename TargetPacket, int LoadMode> -struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> { - template <typename ArgType, typename Device> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { - return impl.template packet<LoadMode>(index); - } -}; - -} // namespace internal - -// Eval as rvalue -template<typename TargetType, typename ArgType, typename Device> -struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> -{ - typedef TensorConversionOp<TargetType, ArgType> XprType; - typedef typename XprType::Index Index; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; - typedef TargetType Scalar; - typedef TargetType CoeffReturnType; - typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename PacketType<SrcType, Device>::type PacketSourceType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - static const bool IsSameType = internal::is_same<TargetType, SrcType>::value; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = - #ifndef EIGEN_USE_SYCL - true, - #else - TensorEvaluator<ArgType, Device>::PacketAccess & - internal::type_casting_traits<SrcType, TargetType>::VectorizedCast, - #endif - BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - RawAccess = false - }; - - static const int NumDims = internal::array_size<Dimensions>::value; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - 
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock - ArgTensorBlock; - - struct TensorConversionOpBlockFactory { - template <typename ArgXprType> - struct XprType { - typedef TensorConversionOp<TargetType, const ArgXprType> type; - }; - - template <typename ArgXprType> - typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const { - return typename XprType<ArgXprType>::type(expr); - } - }; - - typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory, - ArgTensorBlock> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) - { - return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data); - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType data, EvalSubExprsCallback done) { - ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>, - EvaluatorPointerType, - EvalSubExprsCallback>::run(m_impl, data, std::move(done)); - } -#endif - - EIGEN_STRONG_INLINE void cleanup() - { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType - packet(Index index) const { - // If we are not going to do the cast, we just need to check that base - // 
TensorEvaluator has packet access. Otherwise we also need to make sure, - // that we have an implementation of vectorized cast. - const bool Vectorizable = - IsSameType - ? TensorEvaluator<ArgType, Device>::PacketAccess - : int(TensorEvaluator<ArgType, Device>::PacketAccess) & - int(internal::type_casting_traits<SrcType, TargetType>::VectorizedCast); - - return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode, - Vectorizable, IsSameType>::run(m_impl, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>(); - if (vectorized) { - const double SrcCoeffRatio = - internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; - const double TgtCoeffRatio = - internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; - return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + - TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); - } else { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return m_impl.getResourceRequirements(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - return TensorBlock(m_impl.block(desc, scratch), - TensorConversionOpBlockFactory()); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: - TensorEvaluator<ArgType, Device> m_impl; 
-}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h deleted file mode 100644 index b20f80b..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h +++ /dev/null @@ -1,1132 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ -namespace internal { - -template <typename Index, typename InputDims, int NumKernelDims, int Layout> -class IndexMapper { - public: - IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, - const array<Index, NumKernelDims>& indices) { - - array<Index, NumDims> dimensions = input_dims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = indices[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - dimensions[index] = result_dim; - } - - array<Index, NumDims> inputStrides; - array<Index, NumDims> outputStrides; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - inputStrides[0] = 1; - outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; - outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; - } - } else { - inputStrides[NumDims - 1] = 1; - outputStrides[NumDims - 1] = 1; - for (int i = 
static_cast<int>(NumDims) - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; - outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; - } - } - - array<Index, NumDims> gpuInputDimensions; - array<Index, NumDims> gpuOutputDimensions; - array<Index, NumDims> tmp = dimensions; - array<Index, NumDims> ordering; - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = i + offset; - ordering[index] = indices[i]; - tmp[indices[i]] = -1; - gpuInputDimensions[index] = input_dims[indices[i]]; - gpuOutputDimensions[index] = dimensions[indices[i]]; - } - - int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? NumKernelDims - : 0; - for (int i = 0; i < NumDims; ++i) { - if (tmp[i] >= 0) { - ordering[written] = i; - gpuInputDimensions[written] = input_dims[i]; - gpuOutputDimensions[written] = dimensions[i]; - ++written; - } - } - - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[ordering[i]]; - m_outputStrides[i] = outputStrides[ordering[i]]; - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - if (i > NumKernelDims) { - m_gpuInputStrides[i] = - m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; - m_gpuOutputStrides[i] = - m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; - } else { - m_gpuInputStrides[i] = 1; - m_gpuOutputStrides[i] = 1; - } - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (static_cast<size_t>(i + 1) < offset) { - m_gpuInputStrides[i] = - m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; - m_gpuOutputStrides[i] = - m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; - } else { - m_gpuInputStrides[i] = 1; - m_gpuOutputStrides[i] = 1; - } - } - } - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { - Index inputIndex = 0; - if 
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_gpuInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_gpuInputStrides[d]; - } - inputIndex += p * m_inputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_gpuInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_gpuInputStrides[d]; - } - inputIndex += p * m_inputStrides[limit]; - } - return inputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { - Index outputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_gpuOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_gpuOutputStrides[d]; - } - outputIndex += p * m_outputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_gpuOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_gpuOutputStrides[d]; - } - outputIndex += p * m_outputStrides[limit]; - } - return outputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 
0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + - k * m_inputStrides[offset + 2]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + - k * m_outputStrides[offset + 2]; - } - - private: - static const int NumDims = internal::array_size<InputDims>::value; - array<Index, NumDims> m_inputStrides; - array<Index, NumDims> m_outputStrides; - array<Index, NumDims> m_gpuInputStrides; - array<Index, NumDims> m_gpuOutputStrides; -}; - - - -template<typename Dimensions, typename InputXprType, typename KernelXprType> -struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. 
- typedef typename promote_storage_type<typename InputXprType::Scalar, - typename KernelXprType::Scalar>::ret Scalar; - typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, - typename traits<KernelXprType>::StorageKind>::ret StorageKind; - typedef typename promote_index_type<typename traits<InputXprType>::Index, - typename traits<KernelXprType>::Index>::type Index; - typedef typename InputXprType::Nested LhsNested; - typedef typename KernelXprType::Nested RhsNested; - typedef typename remove_reference<LhsNested>::type _LhsNested; - typedef typename remove_reference<RhsNested>::type _RhsNested; - static const int NumDimensions = traits<InputXprType>::NumDimensions; - static const int Layout = traits<InputXprType>::Layout; - typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val, - typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType; - - enum { - Flags = 0 - }; -}; - -template<typename Dimensions, typename InputXprType, typename KernelXprType> -struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> -{ - typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type; -}; - -template<typename Dimensions, typename InputXprType, typename KernelXprType> -struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> -{ - typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type; -}; - -} // end namespace internal - - - -template<typename Indices, typename InputXprType, typename KernelXprType> -class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename 
internal::promote_storage_type<typename InputXprType::CoeffReturnType, - typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; - typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested; - typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) - : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all<typename InputXprType::Nested>::type& - inputExpression() const { return m_input_xpr; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all<typename KernelXprType::Nested>::type& - kernelExpression() const { return m_kernel_xpr; } - - protected: - typename InputXprType::Nested m_input_xpr; - typename KernelXprType::Nested m_kernel_xpr; - const Indices m_indices; -}; - - -template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> -struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> -{ - typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; - - static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; - static const int NumKernelDims = internal::array_size<Indices>::value; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumDims> Dimensions; - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, 
Device>::size; - typedef StorageMemory<Scalar, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned), - PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<InputArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_inputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; - } - } else { - m_inputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; - } - } - - m_dimensions = m_inputImpl.dimensions(); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = 
op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i > 0) { - m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; - } else { - m_kernelStride[0] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; - } - } else { - for (int i = NumKernelDims - 1; i >= 0; --i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i < NumKernelDims - 1) { - m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; - } else { - m_kernelStride[NumKernelDims - 1] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_inputImpl.evalSubExprsIfNeeded(NULL); - preloadKernel(); - return true; - } - EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - void evalTo(typename XprType::Scalar* buffer) { - evalSubExprsIfNeeded(NULL); - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - CoeffReturnType result = CoeffReturnType(0); - convolve(firstInput(index), 0, NumKernelDims-1, result); - return result; - } - - template<int LoadMode> - 
EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const - { - Index indices[2] = {index, index+PacketSize-1}; - Index startInputs[2] = {0, 0}; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } - startInputs[0] += indices[0]; - startInputs[1] += indices[1]; - - if (startInputs[1]-startInputs[0] == PacketSize-1) { - PacketReturnType result = internal::pset1<PacketReturnType>(0); - convolvePacket(startInputs[0], 0, NumKernelDims-1, result); - return result; - } else { - EIGEN_ALIGN_MAX Scalar data[PacketSize]; - data[0] = Scalar(0); - convolve(startInputs[0], 0, NumKernelDims-1, data[0]); - for (int i = 1; i < PacketSize-1; ++i) { - data[i] = Scalar(0); - convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); - } - data[PacketSize-1] = Scalar(0); - convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); - return internal::pload<PacketReturnType>(data); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. 
- const double convolve_compute_cost = - TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - - private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - Index startInput = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } - startInput += index; - return startInput; - } - - EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolve(input, kernel, DimIndex-1, accum); - } else { - accum += m_inputImpl.coeff(input) * m_kernel[kernel]; - } - } - } - - template <typename Packet> - EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex 
> 0) { - convolvePacket(input, kernel, DimIndex-1, accum); - } else { - accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz); - typedef TensorEvalToOp<const KernelArgType> EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value; - internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - array<Index, NumDims> m_inputStride; - array<Index, NumDims> m_outputStride; - - array<Index, NumKernelDims> m_indexStride; - array<Index, NumKernelDims> m_kernelStride; - TensorEvaluator<InputArgType, Device> m_inputImpl; - TensorEvaluator<KernelArgType, Device> m_kernelImpl; - Dimensions m_dimensions; - - KernelArgType m_kernelArg; - const Scalar* m_kernel; - bool m_local_kernel; - const Device EIGEN_DEVICE_REF m_device; -}; - - - - -// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) - -template <int StaticKernelSize> -struct GetKernelSize { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { - return StaticKernelSize; - } -}; -template <> -struct GetKernelSize<Dynamic> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { - return kernelSize; - } -}; - -template <typename InputEvaluator, typename Index, typename InputDims, - int StaticKernelSize> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( - InputEvaluator eval, - const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int kernelSize, float* buffer) { -#if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) -#else - extern __shared__ float s[]; -#endif - - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); - const int num_x_output = last_x - first_x + 1; - - const int first_plane = blockIdx.y * blockDim.y; - const int plane_stride = blockDim.y * gridDim.y; - - for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { - // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = threadIdx.y * num_x_input; - #pragma unroll - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x); - s[i + plane_kernel_offset] = eval.coeff(tensor_index); - } - - __syncthreads(); - - // Compute the convolution - const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int i = threadIdx.x; i < num_x_output; i += 
blockDim.x) { - const int kernel_offset = plane_kernel_offset + i; - float result = 0.0f; - #pragma unroll - for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { - result += s[k + kernel_offset] * kernel[k]; - } - const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x); - buffer[tensor_index] = result; - } - __syncthreads(); - } -}; - -template <typename InputEvaluator, typename Index, typename InputDims, - int StaticKernelSizeX, int StaticKernelSizeY> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( - InputEvaluator eval, - const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int numY, const int maxY, const int kernelSizeX, - const int kernelSizeY, float* buffer) { -#if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) -#else - extern __shared__ float s[]; -#endif - - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX); - const int num_x_output = last_x - first_x + 1; - - const int first_y = blockIdx.y * maxY; - const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY); - const int num_y_output = last_y - first_y + 1; - - const int first_plane = blockIdx.z * blockDim.z; - const int plane_stride = blockDim.z * gridDim.z; - - for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { - - const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = threadIdx.z * num_y_input; - - // Load inputs to shared memory - #pragma unroll - for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { - const int input_offset = num_x_input * (j + plane_kernel_offset); - #pragma unroll - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y); - s[i + input_offset] = eval.coeff(tensor_index); - } - } - - __syncthreads(); - - // Convolution - const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { - #pragma unroll - for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { - float result = 0.0f; - #pragma unroll - for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) { - const int kernel_offset = kernelSizeX * l; - const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); - #pragma unroll - for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) { - result += s[k + input_offset] * kernel[k + kernel_offset]; - } - } - const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y); - buffer[tensor_index] = result; - } - } - - __syncthreads(); - } -}; - -template <typename InputEvaluator, typename Index, typename InputDims> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( - InputEvaluator 
eval, - const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> - indexMapper, - const float* __restrict kernel, const size_t numPlanes, const size_t numX, - const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, - const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, - const size_t kernelSizeZ, float* buffer) { -#if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) -#else - extern __shared__ float s[]; -#endif - - // Load inputs to shared memory - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + kernelSizeX; - - const int first_y = blockIdx.y * maxY; - const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + kernelSizeY; - - const int first_z = blockIdx.z * maxZ; - const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1; - const int num_z_input = last_z - first_z + kernelSizeZ; - - for (int p = 0; p < numPlanes; ++p) { - - const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = 0; - - for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { - for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); - s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); - } - } - } - - __syncthreads(); - - // Convolution - const int num_z_output = last_z - first_z + 1; - const int num_y_output = last_y - first_y + 1; - const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); - - for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { - for (int j = 
threadIdx.y; j < num_y_output; j += blockDim.y) { - for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { - float result = 0.0f; - for (int n = 0; n < kernelSizeZ; ++n) { - for (int m = 0; m < kernelSizeY; ++m) { - for (int l = 0; l < kernelSizeX; ++l) { - result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; - } - } - } - const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); - buffer[tensor_index] = result; - } - } - } - __syncthreads(); - } -}; - - - -template<typename Indices, typename InputArgType, typename KernelArgType> -struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> -{ - typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; - - static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value; - static const int NumKernelDims = internal::array_size<Indices>::value; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions; - - enum { - IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, - PacketAccess = false, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - TensorEvaluator(const XprType& op, const GpuDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), 
m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); - executeEval(m_buf); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. 
it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp<const KernelArgType> EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value; - internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - static unsigned int ceil(unsigned int num, unsigned int denom) { - const unsigned int rounded_toward_zero = num / denom; - if (num > rounded_toward_zero * denom) { - return rounded_toward_zero + 1; - } - return rounded_toward_zero; - } - - void executeEval(Scalar* data) const { - typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; - - const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); - const int warpSize = 32; - - switch (NumKernelDims) { - case 1: { - const int kernel_size = m_kernelImpl.dimensions().TotalSize(); - - const int numX = dimensions()[m_indices[0]]; - const int numP = dimensions().TotalSize() / numX; - int maxX; - dim3 block_size; - - const int single_stride_dim = - static_cast<int>(Layout) == static_cast<int>(ColMajor) - ? 
0 - : m_inputImpl.dimensions().rank() - 1; - if (m_indices[0] == single_stride_dim) { - // Maximum the reuse - const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; - maxX = numext::mini<int>(inner_dim, numX); - const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); - block_size.x = numext::mini(maxThreadsPerBlock, maxX); - block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP); - } - else { - // Read as much as possible alongside the inner most dimension, that is the plane - const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); - const int maxP = numext::mini<int>(inner_dim, numP); - maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); - - block_size.x = numext::mini(warpSize, maxX); - block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP); - } - - const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); - gpu_assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); - - dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y))); - - - //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array<Index, 1> indices(m_indices[0]); - const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); - internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch(kernel_size) { - case 4: { - 
LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); - break; - } - case 7: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); - break; - } - default: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); - } - } - break; - } - - case 2: { - const int idxX = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; - const int idxY = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0; - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numP = dimensions().TotalSize() / (numX*numY); - - const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); - - // Snap maxX to warp size - int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; - const int maxX = numext::mini<int>(inner_dim, numX); - const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); - const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); - - dim3 block_size; - block_size.x = numext::mini(1024, maxX); - block_size.y = numext::mini<int>(1024/block_size.x, maxY); - block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP); - - 
const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); - gpu_assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int num_y_blocks = ceil(numY, maxY); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); - - dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z))); - - - //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]); - const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY]); - internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch (kernel_size_x) { - case 4: { - switch (kernel_size_y) { - case 7: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); - break; - } - default: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); - break; - } - } - break; - } - case 7: { - switch (kernel_size_y) { - case 4: { - 
LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); - break; - } - default: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); - break; - } - } - break; - } - default: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); - break; - } - } - break; - } - - case 3: { - const int idxX = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; - const int idxY = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; - const int idxZ = - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
2 : 0; - - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numZ = dimensions()[m_indices[idxZ]]; - const int numP = dimensions().TotalSize() / (numX*numY*numZ); - - const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); - const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); - const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); - - dim3 block_size; - block_size.x = numext::mini(32, maxX); - block_size.y = numext::mini(32, maxY); - block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ); - dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); - - const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); - gpu_assert(shared_mem <= maxSharedMem); - - //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], - m_indices[idxZ]); - const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY], - m_kernelImpl.dimensions()[idxZ]); - internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( - m_inputImpl.dimensions(), 
kernel_dims, indices); - - LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. 
- const double convolve_compute_cost = - TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - - TensorEvaluator<InputArgType, GpuDevice> m_inputImpl; - TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl; - KernelArgType m_kernelArg; - Indices m_indices; - Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; - bool m_local_kernel; - - const GpuDevice& m_device; -}; -#endif - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h deleted file mode 100644 index 033318f..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h +++ /dev/null @@ -1,544 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> - -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ - -enum class convolution_type { CONV1D, CONV2D, CONV3D }; -template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims, - typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim> -struct EigenConvolutionKernel; -template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims, - typename Kernel_accessor, typename Buffer_accessor> -struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor, - Buffer_accessor, convolution_type::CONV1D> { - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - Local_accessor; - Local_accessor local_acc; - Evaluator device_evaluator; - Kernel_accessor kernel_filter; - Buffer_accessor buffer_acc; - internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper; - const size_t kernelSize; - const cl::sycl::range<2> input_range; - EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, - Buffer_accessor buffer_acc_, - internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_, - const size_t kernelSize_, const cl::sycl::range<2> input_range_) - : local_acc(local_acc_), - device_evaluator(device_evaluator_), - kernel_filter(kernel_filter_), - buffer_acc(buffer_acc_), - indexMapper(indexMapper_), - kernelSize(kernelSize_), - input_range(input_range_) {} - - template <typename BooleanDim2> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { - return (boolean_check[0] && boolean_check[1]); - } - void operator()(cl::sycl::nd_item<2> itemID) { - auto 
buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); - // the required row to be calculated for the for each plane in shered memory - const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); - const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; - const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0]; - const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1)); - /// fill the shared memory - for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) { - const size_t local_index = i + plane_kernel_offset; - const size_t tensor_index = - plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset); - - local_acc[local_index] = - (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1]) - ? device_evaluator.coeff(tensor_index) - : CoeffReturnType(0); - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution // output start x - const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]); - if (boundary_check(itemID.get_global_id() < input_range)) { - CoeffReturnType result = static_cast<CoeffReturnType>(0); - const size_t index = plane_kernel_offset + itemID.get_local_id(0); - for (size_t k = 0; k < kernelSize; ++k) { - result += (local_acc[k + index] * kernel_ptr[k]); - } - const size_t tensor_index = - indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) + - indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start); - buffer_ptr[tensor_index] = result; - } - } -}; - -template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims, - typename Kernel_accessor, typename Buffer_accessor> -struct EigenConvolutionKernel<Evaluator, CoeffReturnType, 
KernelType, Index, InputDims, Kernel_accessor, - Buffer_accessor, convolution_type::CONV2D> { - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - Local_accessor; - Local_accessor local_acc; - Evaluator device_evaluator; - Kernel_accessor kernel_filter; - Buffer_accessor buffer_acc; - internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper; - const cl::sycl::range<2> kernel_size; - const cl::sycl::range<3> input_range; - EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, - Buffer_accessor buffer_acc_, - internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_, - const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_) - : local_acc(local_acc_), - device_evaluator(device_evaluator_), - kernel_filter(kernel_filter_), - buffer_acc(buffer_acc_), - indexMapper(indexMapper_), - kernel_size(kernel_size_), - input_range(input_range_) {} - template <typename BooleanDim3> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { - return (boolean_check[0] && boolean_check[1] && boolean_check[2]); - } - - void operator()(cl::sycl::nd_item<3> itemID) { - auto buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); - // the required row to be calculated for the for each plane in shered memory - const auto num_input = cl::sycl::range<2>{ - (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; - - const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2)); - const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1]; - - const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], - itemID.get_group(1) * itemID.get_local_range()[1]}; - - // fill the local memory - bool in_range_dim2 = 
itemID.get_global_id(2) < input_range[2]; - for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { - const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset); - bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); - for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { - const size_t local_index = i + local_input_offset; - const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( - i + input_offset[0], j + input_offset[1]); - local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && - in_range_dim1 && in_range_dim2) - ? device_evaluator.coeff(tensor_index) - : CoeffReturnType(0); - } - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // output offset start for each thread - const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], - itemID.get_group(1) * itemID.get_local_range()[1]}; - - if (boundary_check(itemID.get_global_id() < input_range)) { - CoeffReturnType result = static_cast<CoeffReturnType>(0); - - for (size_t j = 0; j < kernel_size[1]; j++) { - size_t kernel_offset = kernel_size[0] * j; - const size_t index = - (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); - for (size_t i = 0; i < kernel_size[0]; i++) { - result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); - } - } - const size_t tensor_index = - indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + - indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], - itemID.get_local_id(1) + output_offset[1]); - - buffer_ptr[tensor_index] = result; - } - } -}; - -template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims, - typename Kernel_accessor, typename Buffer_accessor> 
-struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor, - Buffer_accessor, convolution_type::CONV3D> { - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - Local_accessor; - Local_accessor local_acc; - Evaluator device_evaluator; - Kernel_accessor kernel_filter; - Buffer_accessor buffer_acc; - internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper; - const cl::sycl::range<3> kernel_size; - const cl::sycl::range<3> input_range; - const size_t numP; - - EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, - Buffer_accessor buffer_acc_, - internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_, - const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, - const size_t numP_) - : local_acc(local_acc_), - device_evaluator(device_evaluator_), - kernel_filter(kernel_filter_), - buffer_acc(buffer_acc_), - indexMapper(indexMapper_), - kernel_size(kernel_size_), - input_range(input_range_), - numP(numP_) {} - template <typename BooleanDim3> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { - return (boolean_check[0] && boolean_check[1] && boolean_check[2]); - } - void operator()(cl::sycl::nd_item<3> itemID) { - auto buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); - const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; - - const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; - - const auto output_offset = - cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; - - for (size_t p = 0; p < numP; p++) { - /// fill the shared memory - const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); - for (size_t k = 
itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) { - size_t local_index_dim2 = num_input[0] * num_input[1] * k; - bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1)); - for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { - bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1)); - size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2; - for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { - bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1)); - const size_t local_index = local_index_dim1 + i; - const size_t tensor_index = - plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( - i + input_offset[0], j + input_offset[1], k + input_offset[2]); - local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0); - } - } - } - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution - - if (boundary_check(itemID.get_global_id() < input_range)) { - CoeffReturnType result = static_cast<CoeffReturnType>(0); - for (size_t k = 0; k < kernel_size[2]; k++) { - for (size_t j = 0; j < kernel_size[1]; j++) { - for (size_t i = 0; i < kernel_size[0]; i++) { - const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); - const size_t local_index = - ((i + itemID.get_local_id(0)) + - num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); - - result += (local_acc[local_index] * kernel_ptr[kernel_index]); - } - } - } - const size_t tensor_index = - indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + - indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); - buffer_ptr[tensor_index] = result; - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - } -}; - -template 
<typename Indices, typename InputArgType, typename KernelArgType> -struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> { - typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; - - static const int NumDims = - internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value; - static const int NumKernelDims = internal::array_size<Indices>::value; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions; - typedef const Eigen::SyclDevice Device; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage; - - enum { - IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned & - TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned, - PacketAccess = false, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) - : m_inputImpl(op.inputExpression(), device), - m_kernelArg(op.kernelExpression()), - m_kernelImpl(op.kernelExpression(), device), - m_indices(op.indices()), - m_buf(NULL), 
- m_kernel(NULL), - m_local_kernel(false), - m_device(device) { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) == - static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims = - m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (EvaluatorPointerType)m_device.get( - (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); - executeEval(m_buf); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate_temp(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - m_device.deallocate_temp(m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. 
it's an - // expression that needs to be evaluated) - typename KernelStorage::Type in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); - typedef TensorEvalToOp<const KernelArgType> EvalTo; - EvalTo evalToTmp(m_device.get(local), m_kernelArg); - const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value; - internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device); - m_kernel = local; - m_local_kernel = true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { - typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator; - typedef typename InputEvaluator::Dimensions InputDims; - switch (NumKernelDims) { - case 1: { - const size_t numX = dimensions()[m_indices[0]]; - const size_t numP = dimensions().TotalSize() / numX; - const auto input_dim = std::array<size_t, 2>{numX, numP}; - auto global_range = cl::sycl::range<2>{}; - auto local_range = cl::sycl::range<2>{}; - const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); - - m_device.parallel_for_setup(input_dim, global_range, local_range); - const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); - gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock()); - const array<Index, 1> indices{{m_indices[0]}}; - const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}}; - internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - - typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims, - typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D> - 
ConvKernel; - - m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>( - m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])); - break; - } - - case 2: { - auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1, - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0}; - auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], - (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; - const size_t numX = dimensions()[m_indices[kernel_index[0]]]; - const size_t numY = dimensions()[m_indices[kernel_index[1]]]; - const size_t numP = dimensions().TotalSize() / (numX * numY); - auto input_dim = std::array<size_t, 3>{numX, numY, numP}; - - auto global_range = cl::sycl::range<3>{}; - auto local_range = cl::sycl::range<3>{}; - - m_device.parallel_for_setup(input_dim, global_range, local_range); - - const size_t local_memory_size = - (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; - gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock()); - const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; - const array<Index, 2> kernel_dims{ - {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; - internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims, - typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D> - ConvKernel; - m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>( - m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, 
cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}); - break; - } - - case 3: { - auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2, - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1, - static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0}; - - auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], - (size_t)m_kernelImpl.dimensions()[kernel_index[1]], - (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; - - const size_t numX = dimensions()[m_indices[kernel_index[0]]]; - const size_t numY = dimensions()[m_indices[kernel_index[1]]]; - const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; - auto input_dim = std::array<size_t, 3>{numX, numY, numZ}; - const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); - - const array<Index, 3> indices{ - {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; - const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], - m_kernelImpl.dimensions()[kernel_index[1]], - m_kernelImpl.dimensions()[kernel_index[2]]}}; - - internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - - auto global_range = cl::sycl::range<3>{}; - auto local_range = cl::sycl::range<3>{}; - - m_device.parallel_for_setup(input_dim, global_range, local_range); - auto local_memory_range = (local_range + kernel_size - 1); - const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; - - gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock()); - typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims, - typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D> - ConvKernel; - m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>( - m_inputImpl, m_kernel, data, 
cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), - THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_buf != NULL); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template <int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { - eigen_assert(m_buf != NULL); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. 
- const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); - } - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_kernelImpl.bind(cgh); - m_inputImpl.bind(cgh); - m_buf.bind(cgh); - m_kernel.bind(cgh); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator &operator=(const TensorEvaluator &); - TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl; - KernelArgType m_kernelArg; - TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl; - Indices m_indices; - Dimensions m_dimensions; - EvaluatorPointerType m_buf; - typename KernelStorage::Type m_kernel; - bool m_local_kernel; - const Eigen::SyclDevice EIGEN_DEVICE_REF m_device; -}; // namespace Eigen - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h deleted file mode 100644 index 195267c..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h +++ /dev/null @@ -1,214 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H - -namespace Eigen { - -/** \class TensorEvaluator - * \ingroup CXX11_Tensor_Module - * - * \brief A cost model used to limit the number of threads used for evaluating - * tensor expression. - * - */ - -// Class storing the cost of evaluating a tensor expression in terms of the -// estimated number of operand bytes loads, bytes stored, and compute cycles. -class TensorOpCost { - public: - // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple - // model based on minimal reciprocal throughput numbers from Intel or - // Agner Fog's tables would be better than what is there now. - template <typename ArgType> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { - return internal::functor_traits< - internal::scalar_product_op<ArgType, ArgType> >::Cost; - } - template <typename ArgType> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { - return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost; - } - template <typename ArgType> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { - return internal::functor_traits< - internal::scalar_quotient_op<ArgType, ArgType> >::Cost; - } - template <typename ArgType> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { - return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost; - } - template <typename SrcType, typename TargetType> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { - return internal::functor_traits< - internal::scalar_cast_op<SrcType, TargetType> >::Cost; - } - - EIGEN_DEVICE_FUNC - TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} - EIGEN_DEVICE_FUNC - TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) - : bytes_loaded_(bytes_loaded), - bytes_stored_(bytes_stored), - compute_cycles_(compute_cycles) {} - - EIGEN_DEVICE_FUNC - TensorOpCost(double bytes_loaded, 
double bytes_stored, double compute_cycles, - bool vectorized, double packet_size) - : bytes_loaded_(bytes_loaded), - bytes_stored_(bytes_stored), - compute_cycles_(vectorized ? compute_cycles / packet_size - : compute_cycles) { - eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); - eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); - eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { - return bytes_loaded_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { - return bytes_stored_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { - return compute_cycles_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( - double load_cost, double store_cost, double compute_cost) const { - return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + - compute_cost * compute_cycles_; - } - - // Drop memory access component. Intended for cases when memory accesses are - // sequential or are completely masked by computations. - EIGEN_DEVICE_FUNC void dropMemoryCost() { - bytes_loaded_ = 0; - bytes_stored_ = 0; - } - - // TODO(rmlarsen): Define min in terms of total cost, not elementwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( - const TensorOpCost& rhs) const { - double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); - double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); - double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); - return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); - } - - // TODO(rmlarsen): Define max in terms of total cost, not elementwise. 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( - const TensorOpCost& rhs) const { - double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); - double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); - double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); - return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( - const TensorOpCost& rhs) { - bytes_loaded_ += rhs.bytes_loaded(); - bytes_stored_ += rhs.bytes_stored(); - compute_cycles_ += rhs.compute_cycles(); - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { - bytes_loaded_ *= rhs; - bytes_stored_ *= rhs; - compute_cycles_ *= rhs; - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( - TensorOpCost lhs, const TensorOpCost& rhs) { - lhs += rhs; - return lhs; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( - TensorOpCost lhs, double rhs) { - lhs *= rhs; - return lhs; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( - double lhs, TensorOpCost rhs) { - rhs *= lhs; - return rhs; - } - - friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { - return os << "[bytes_loaded = " << tc.bytes_loaded() - << ", bytes_stored = " << tc.bytes_stored() - << ", compute_cycles = " << tc.compute_cycles() << "]"; - } - - private: - double bytes_loaded_; - double bytes_stored_; - double compute_cycles_; -}; - -// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads -// in [1:max_threads] instead of just switching multi-threading off for small -// work units. -template <typename Device> -class TensorCostModel { - public: - // Scaling from Eigen compute cost to device cycles. - static const int kDeviceCyclesPerComputeCycle = 1; - - // Costs in device cycles. 
- static const int kStartupCycles = 100000; - static const int kPerThreadCycles = 100000; - static const int kTaskSize = 40000; - - // Returns the number of threads in [1:max_threads] to use for - // evaluating an expression with the given output size and cost per - // coefficient. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( - double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { - double cost = totalCost(output_size, cost_per_coeff); - double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - // Make sure we don't invoke undefined behavior when we convert to an int. - threads = numext::mini<double>(threads, GenericNumTraits<int>::highest()); - return numext::mini(max_threads, - numext::maxi<int>(1, static_cast<int>(threads))); - } - - // taskSize assesses parallel task size. - // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task - // granularity needs to be increased to mitigate parallelization overheads. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( - double output_size, const TensorOpCost& cost_per_coeff) { - return totalCost(output_size, cost_per_coeff) / kTaskSize; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( - double output_size, const TensorOpCost& cost_per_coeff) { - // Cost of memory fetches from L2 cache. 64 is typical cache line size. - // 11 is L2 cache latency on Haswell. - // We don't know whether data is in L1, L2 or L3. But we are most interested - // in single-threaded computational time around 100us-10ms (smaller time - // is too small for parallelization, larger time is not interesting - // either because we are probably using all available threads already). - // And for the target time range, L2 seems to be what matters. Data set - // fitting into L1 is too small to take noticeable time. Data set fitting - // only into L3 presumably will take more than 10ms to load and process. 
- const double kLoadCycles = 1.0 / 64 * 11; - const double kStoreCycles = 1.0 / 64 * 11; - // Scaling from Eigen compute cost to device cycles. - return output_size * - cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, - kDeviceCyclesPerComputeCycle); - } -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h deleted file mode 100644 index 95a8a84..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h +++ /dev/null @@ -1,347 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H -#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H - -namespace Eigen { - -/** \class TensorCustomUnaryOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor custom class. 
- * - * - */ -namespace internal { -template<typename CustomUnaryFunc, typename XprType> -struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> > -{ - typedef typename XprType::Scalar Scalar; - typedef typename XprType::StorageKind StorageKind; - typedef typename XprType::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = traits<XprType>::NumDimensions; - static const int Layout = traits<XprType>::Layout; - typedef typename traits<XprType>::PointerType PointerType; -}; - -template<typename CustomUnaryFunc, typename XprType> -struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense> -{ - typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>EIGEN_DEVICE_REF type; -}; - -template<typename CustomUnaryFunc, typename XprType> -struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> > -{ - typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type; -}; - -} // end namespace internal - - - -template<typename CustomUnaryFunc, typename XprType> -class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename internal::nested<TensorCustomUnaryOp>::type Nested; - typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind; - typedef typename internal::traits<TensorCustomUnaryOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) - : m_expr(expr), m_func(func) {} - - EIGEN_DEVICE_FUNC - const CustomUnaryFunc& func() const { return m_func; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_expr; } - - protected: 
- typename XprType::Nested m_expr; - const CustomUnaryFunc m_func; -}; - - -// Eval as rvalue -template<typename CustomUnaryFunc, typename XprType, typename Device> -struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device> -{ - typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType; - typedef typename internal::traits<ArgType>::Index Index; - static const int NumDims = internal::traits<ArgType>::NumDimensions; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<XprType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) - : m_op(op), m_device(device), m_result(NULL) - { - m_dimensions = op.func().dimensions(op.expression()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - if (data) { - evalTo(data); - return false; - } else { - m_result = 
static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*) - m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); - evalTo(m_result); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - if (m_result) { - m_device.deallocate_temp(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_result.bind(cgh); - } -#endif - - protected: - void evalTo(EvaluatorPointerType data) { - TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions); - m_op.func().eval(m_op.expression(), result, m_device); - } - - Dimensions m_dimensions; - const ArgType m_op; - const Device EIGEN_DEVICE_REF m_device; - EvaluatorPointerType m_result; -}; - - - -/** \class TensorCustomBinaryOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor custom class. 
- * - * - */ -namespace internal { -template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> -struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > -{ - typedef typename internal::promote_storage_type<typename LhsXprType::Scalar, - typename RhsXprType::Scalar>::ret Scalar; - typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, - typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; - typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, - typename traits<RhsXprType>::StorageKind>::ret StorageKind; - typedef typename promote_index_type<typename traits<LhsXprType>::Index, - typename traits<RhsXprType>::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference<LhsNested>::type _LhsNested; - typedef typename remove_reference<RhsNested>::type _RhsNested; - static const int NumDimensions = traits<LhsXprType>::NumDimensions; - static const int Layout = traits<LhsXprType>::Layout; - typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, - typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType; -}; - -template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> -struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense> -{ - typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type; -}; - -template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> -struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > -{ - typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type; -}; - -} // end namespace internal - - - -template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> -class TensorCustomBinaryOp : public 
TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType; - typedef typename internal::nested<TensorCustomBinaryOp>::type Nested; - typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind; - typedef typename internal::traits<TensorCustomBinaryOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) - - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} - - EIGEN_DEVICE_FUNC - const CustomBinaryFunc& func() const { return m_func; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename LhsXprType::Nested>::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename RhsXprType::Nested>::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const CustomBinaryFunc m_func; -}; - - -// Eval as rvalue -template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device> -struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device> -{ - typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType; - typedef typename internal::traits<XprType>::Index Index; - static const int NumDims = internal::traits<XprType>::NumDimensions; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = 
PacketType<CoeffReturnType, Device>::size; - - typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<LhsXprType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_op(op), m_device(device), m_result(NULL) - { - m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*) - m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); - evalTo(m_result); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { - m_device.deallocate_temp(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. 
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_result.bind(cgh); - } -#endif - - protected: - void evalTo(EvaluatorPointerType data) { - TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions); - m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); - } - - Dimensions m_dimensions; - const XprType m_op; - const Device EIGEN_DEVICE_REF m_device; - EvaluatorPointerType m_result; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h deleted file mode 100644 index 96fa46c..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h +++ /dev/null @@ -1,137 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H - -namespace Eigen { - -/** \class TensorDevice - * \ingroup CXX11_Tensor_Module - * - * \brief Pseudo expression providing an operator = that will evaluate its argument - * on the specified computing 'device' (GPU, thread pool, ...) - * - * Example: - * C.device(EIGEN_GPU) = A + B; - * - * Todo: operator *= and /=. 
- */ - -template <typename ExpressionType, typename DeviceType> class TensorDevice { - public: - TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} - - EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice) - - template<typename OtherDerived> - EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; - Assign assign(m_expression, other); - internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); - return *this; - } - - template<typename OtherDerived> - EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; - Sum sum(m_expression, other); - typedef TensorAssignOp<ExpressionType, const Sum> Assign; - Assign assign(m_expression, sum); - internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); - return *this; - } - - template<typename OtherDerived> - EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference; - Difference difference(m_expression, other); - typedef TensorAssignOp<ExpressionType, const Difference> Assign; - Assign assign(m_expression, difference); - internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device); - return *this; - } - - protected: - const DeviceType& m_device; - ExpressionType& m_expression; -}; - -/** \class TensorAsyncDevice - * \ingroup CXX11_Tensor_Module - * - * \brief Pseudo expression providing an operator = that will evaluate its - * argument asynchronously on the specified device. 
Currently only - * ThreadPoolDevice implements proper asynchronous execution, while the default - * and GPU devices just run the expression synchronously and call m_done() on - * completion.. - * - * Example: - * auto done = []() { ... expression evaluation done ... }; - * C.device(thread_pool_device, std::move(done)) = A + B; - */ - -template <typename ExpressionType, typename DeviceType, typename DoneCallback> -class TensorAsyncDevice { - public: - TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, - DoneCallback done) - : m_device(device), m_expression(expression), m_done(std::move(done)) {} - - template <typename OtherDerived> - EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { - typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; - typedef internal::TensorExecutor<const Assign, DeviceType> Executor; - - Assign assign(m_expression, other); - Executor::run(assign, m_device); - m_done(); - - return *this; - } - - protected: - const DeviceType& m_device; - ExpressionType& m_expression; - DoneCallback m_done; -}; - - -#ifdef EIGEN_USE_THREADS -template <typename ExpressionType, typename DoneCallback> -class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> { - public: - TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, - DoneCallback done) - : m_device(device), m_expression(expression), m_done(std::move(done)) {} - - template <typename OtherDerived> - EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { - typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; - typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor; - - // WARNING: After assignment 'm_done' callback will be in undefined state. 
- Assign assign(m_expression, other); - Executor::runAsync(assign, m_device, std::move(m_done)); - - return *this; - } - - protected: - const ThreadPoolDevice& m_device; - ExpressionType& m_expression; - DoneCallback m_done; -}; -#endif - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h deleted file mode 100644 index f779239..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h +++ /dev/null @@ -1,6 +0,0 @@ - -#if defined(__clang__) || defined(__GNUC__) -#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file" -#endif - -#include "TensorDeviceGpu.h" diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h deleted file mode 100644 index 46b9d3a..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h +++ /dev/null @@ -1,104 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H - - -namespace Eigen { - -// Default device for the machine (typically a single cpu core) -struct DefaultDevice { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { - return allocate(num_bytes); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { - deallocate(buffer); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - template<typename Type> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { - return data; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) - // Running on the host CPU - return 1; -#elif defined(EIGEN_HIP_DEVICE_COMPILE) - // Running on a HIP device - return 64; -#else - // Running on a CUDA device - return 32; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) - // Running on the host CPU - return l1CacheSize(); -#elif defined(EIGEN_HIP_DEVICE_COMPILE) - // Running on a HIP device - return 48*1024; // FIXME : update this number for HIP -#else - // Running on a CUDA device, return 
the amount of shared memory available. - return 48*1024; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) - // Running single threaded on the host CPU - return l3CacheSize(); -#elif defined(EIGEN_HIP_DEVICE_COMPILE) - // Running on a HIP device - return firstLevelCacheSize(); // FIXME : update this number for HIP -#else - // Running on a CUDA device - return firstLevelCacheSize(); -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) - // Running single threaded on the host CPU - // Should return an enum that encodes the ISA supported by the CPU - return 1; -#elif defined(EIGEN_HIP_DEVICE_COMPILE) - // Running on a HIP device - // return 1 as major for HIP - return 1; -#else - // Running on a CUDA device - return EIGEN_CUDA_ARCH / 100; -#endif - } -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h deleted file mode 100644 index ec2e3cb..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h +++ /dev/null @@ -1,389 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H - -// This header file container defines fo gpu* macros which will resolve to -// their equivalent hip* or cuda* versions depending on the compiler in use -// A separate header (included at the end of this file) will undefine all -#include "TensorGpuHipCudaDefines.h" - -namespace Eigen { - -static const int kGpuScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// HIP / CUDA streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const gpuStream_t& stream() const = 0; - virtual const gpuDeviceProp_t& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. - virtual unsigned int* semaphore() const = 0; -}; - -class GpuDeviceProperties { - public: - GpuDeviceProperties() : - initialized_(false), first_(true), device_properties_(nullptr) {} - - ~GpuDeviceProperties() { - if (device_properties_) { - delete[] device_properties_; - } - } - - EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { - return device_properties_[device]; - } - - EIGEN_STRONG_INLINE bool isInitialized() const { - return initialized_; - } - - void initialize() { - if (!initialized_) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. 
This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. - if (first_.exchange(false)) { - // We're the first thread to reach this point. - int num_devices; - gpuError_t status = gpuGetDeviceCount(&num_devices); - if (status != gpuSuccess) { - std::cerr << "Failed to get the number of GPU devices: " - << gpuGetErrorString(status) - << std::endl; - gpu_assert(status == gpuSuccess); - } - device_properties_ = new gpuDeviceProp_t[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = gpuGetDeviceProperties(&device_properties_[i], i); - if (status != gpuSuccess) { - std::cerr << "Failed to initialize GPU device #" - << i - << ": " - << gpuGetErrorString(status) - << std::endl; - gpu_assert(status == gpuSuccess); - } - } - - std::atomic_thread_fence(std::memory_order_release); - initialized_ = true; - } else { - // Wait for the other thread to inititialize the properties. 
- while (!initialized_) { - std::atomic_thread_fence(std::memory_order_acquire); - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } - } - } - } - - private: - volatile bool initialized_; - std::atomic<bool> first_; - gpuDeviceProp_t* device_properties_; -}; - -EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { - static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); - if (!deviceProperties->isInitialized()) { - deviceProperties->initialize(); - } - return *deviceProperties; -} - -EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { - return GetGpuDeviceProperties().get(device); -} - -static const gpuStream_t default_stream = gpuStreamDefault; - -class GpuStreamDevice : public StreamInterface { - public: - // Use the default stream on the current device - GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - gpuGetDevice(&device_); - } - // Use the default stream on the specified device - GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} - // Use the specified stream. Note that it's the - // caller responsibility to ensure that the stream can run on - // the specified device. If no device is specified the code - // assumes that the stream is associated to the current gpu device. 
- GpuStreamDevice(const gpuStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - gpuGetDevice(&device_); - } else { - int num_devices; - gpuError_t err = gpuGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - gpu_assert(device < num_devices); - device_ = device; - } - } - - virtual ~GpuStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const gpuStream_t& stream() const { return *stream_; } - const gpuDeviceProp_t& deviceProperties() const { - return GetGpuDeviceProperties(device_); - } - virtual void* allocate(size_t num_bytes) const { - gpuError_t err = gpuSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - void* result; - err = gpuMalloc(&result, num_bytes); - gpu_assert(err == gpuSuccess); - gpu_assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - gpuError_t err = gpuSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - gpu_assert(buffer != NULL); - err = gpuFree(buffer); - gpu_assert(err == gpuSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize; - semaphore_ = reinterpret_cast<unsigned int*>(scratch); - gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - } - return semaphore_; - } - - private: - const gpuStream_t* stream_; - int device_; - mutable void* scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. 
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. - EIGEN_STRONG_INLINE const gpuStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { - stream_->deallocate(buffer); - } - - template<typename Type> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { - return data; - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef EIGEN_GPU_COMPILE_PHASE - gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); -#else - EIGEN_UNUSED_VARIABLE(dst); - EIGEN_UNUSED_VARIABLE(src); - EIGEN_UNUSED_VARIABLE(n); - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - gpuError_t err = - gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - gpuError_t err = - 
gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef EIGEN_GPU_COMPILE_PHASE - gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on hip/cuda devices. - return firstLevelCacheSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#ifndef EIGEN_GPU_COMPILE_PHASE - gpuError_t err = gpuStreamSynchronize(stream_->stream()); - if (err != gpuSuccess) { - std::cerr << "Error detected in GPU stream: " - << gpuGetErrorString(err) - << std::endl; - gpu_assert(err == gpuSuccess); - } -#else - gpu_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return 
stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the GPU runtime recorded an error for the - // underlying stream device. - inline bool ok() const { -#ifdef EIGEN_GPUCC - gpuError_t error = gpuStreamQuery(stream_->stream()); - return (error == gpuSuccess) || (error == gpuErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#if defined(EIGEN_HIPCC) - -#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ - gpu_assert(hipGetLastError() == hipSuccess); - -#else - -#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - gpu_assert(cudaGetLastError() == cudaSuccess); - -#endif - -// FIXME: Should be device and kernel specific. -#ifdef EIGEN_GPUCC -static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { -#ifndef EIGEN_GPU_COMPILE_PHASE - gpuError_t status = gpuDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - gpu_assert(status == gpuSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen - -// undefine all the gpu* macros we defined at the beginning of the file -#include "TensorGpuHipCudaUndefines.h" - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h deleted file mode 100644 index df591c2..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h +++ /dev/null @@ -1,1048 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. 
-// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> - -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -#include <unordered_set> - -namespace Eigen { - -namespace TensorSycl { -namespace internal { - -/// Cache all the device information needed -struct SyclDeviceInfo { - SyclDeviceInfo(cl::sycl::queue queue) - : local_mem_type( - queue.get_device() - .template get_info<cl::sycl::info::device::local_mem_type>()), - max_work_item_sizes( - queue.get_device() - .template get_info< - cl::sycl::info::device::max_work_item_sizes>()), - max_mem_alloc_size( - queue.get_device() - .template get_info< - cl::sycl::info::device::max_mem_alloc_size>()), - max_compute_units(queue.get_device() - .template get_info< - cl::sycl::info::device::max_compute_units>()), - max_work_group_size( - queue.get_device() - .template get_info< - cl::sycl::info::device::max_work_group_size>()), - local_mem_size( - queue.get_device() - .template get_info<cl::sycl::info::device::local_mem_size>()), - platform_name(queue.get_device() - .get_platform() - .template get_info<cl::sycl::info::platform::name>()), - device_name(queue.get_device() - .template get_info<cl::sycl::info::device::name>()), - device_vendor( - queue.get_device() - .template get_info<cl::sycl::info::device::vendor>()) {} - - cl::sycl::info::local_mem_type local_mem_type; - cl::sycl::id<3> max_work_item_sizes; - unsigned long max_mem_alloc_size; - unsigned long max_compute_units; - unsigned long max_work_group_size; - size_t local_mem_size; - std::string platform_name; - std::string device_name; - std::string 
device_vendor; -}; - -} // end namespace internal -} // end namespace TensorSycl - -typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; -// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and -// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently -// TensorFlow via the Eigen SYCL Backend. -EIGEN_STRONG_INLINE auto get_sycl_supported_devices() - -> decltype(cl::sycl::device::get_devices()) { -#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR - return {cl::sycl::device(cl::sycl::default_selector())}; -#else - std::vector<cl::sycl::device> supported_devices; - auto platform_list = cl::sycl::platform::get_platforms(); - for (const auto &platform : platform_list) { - auto device_list = platform.get_devices(); - auto platform_name = - platform.template get_info<cl::sycl::info::platform::name>(); - std::transform(platform_name.begin(), platform_name.end(), - platform_name.begin(), ::tolower); - for (const auto &device : device_list) { - auto vendor = device.template get_info<cl::sycl::info::device::vendor>(); - std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); - bool unsupported_condition = - (device.is_cpu() && platform_name.find("amd") != std::string::npos && - vendor.find("apu") == std::string::npos) || - (platform_name.find("experimental") != std::string::npos) || - device.is_host(); - if (!unsupported_condition) { - supported_devices.push_back(device); - } - } - } - return supported_devices; -#endif -} - -class QueueInterface { - public: - /// Creating device by using cl::sycl::selector or cl::sycl::device. 
- template <typename DeviceOrSelector> - explicit QueueInterface( - const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, - unsigned num_threads = std::thread::hardware_concurrency()) - : m_queue(dev_or_sel, handler), -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - m_prog(m_queue.get_context(), get_sycl_supported_devices()), -#endif - m_thread_pool(num_threads), - m_device_info(m_queue) { -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - m_prog.build_with_kernel_type<DeviceOrSelector>(); - auto f = [&](cl::sycl::handler &cgh) { - cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(), - [=]() {}) - }; - EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); -#endif - } - - template <typename DeviceOrSelector> - explicit QueueInterface( - const DeviceOrSelector &dev_or_sel, - unsigned num_threads = std::thread::hardware_concurrency()) - : QueueInterface(dev_or_sel, - [this](cl::sycl::exception_list l) { - this->exception_caught_ = this->sycl_async_handler(l); - }, - num_threads) {} - -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } -#endif - - /// Attach an existing buffer to the pointer map, Eigen will not reuse it - EIGEN_STRONG_INLINE void *attach_buffer( - cl::sycl::buffer<buffer_scalar_t, 1> &buf) const { - std::lock_guard<std::mutex> lock(pmapper_mutex_); - return static_cast<void *>(pMapper.add_pointer(buf)); - } - - /// Detach previously attached buffer - EIGEN_STRONG_INLINE void detach_buffer(void *p) const { - std::lock_guard<std::mutex> lock(pmapper_mutex_); - TensorSycl::internal::SYCLfree<false>(p, pMapper); - } - - /// Allocating device pointer. This pointer is actually an 8 bytes host - /// pointer used as key to access the sycl device buffer. The reason is that - /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode - /// expressions. So we create a key pointer to be used in Eigen expression - /// construction. 
When we convert the Eigen construction into the sycl - /// construction we use this pointer as a key in our buffer_map and we make - /// sure that we dedicate only one buffer only for this pointer. The device - /// pointer would be deleted by calling deallocate function. - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { -#if EIGEN_MAX_ALIGN_BYTES > 0 - size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; - if (align > 0) { - num_bytes += EIGEN_MAX_ALIGN_BYTES - align; - } -#endif - std::lock_guard<std::mutex> lock(pmapper_mutex_); - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); - } - - EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { -#if EIGEN_MAX_ALIGN_BYTES > 0 - size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; - if (align > 0) { - num_bytes += EIGEN_MAX_ALIGN_BYTES - align; - } -#endif - std::lock_guard<std::mutex> lock(pmapper_mutex_); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - if (scratch_buffers.empty()) { - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); - ; - } else { - for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { - auto buff = pMapper.get_buffer(*it); - if (buff.get_size() >= num_bytes) { - auto ptr = *it; - scratch_buffers.erase(it); - return ptr; - } else { - ++it; - } - } - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); - } -#else - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); -#endif - } - template <typename data_t> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, data_t> - get(data_t *data) const { - return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data); - } - template <typename data_t> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( - TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, - data_t> - data) const { - return static_cast<data_t *>(data.get_virtual_pointer()); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { 
- std::lock_guard<std::mutex> lock(pmapper_mutex_); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - scratch_buffers.insert(p); -#else - TensorSycl::internal::SYCLfree(p, pMapper); -#endif - } - template <cl::sycl::access::mode AcMd, typename T> - EIGEN_STRONG_INLINE void deallocate_temp( - const TensorSycl::internal::RangeAccess<AcMd, T> &p) const { - deallocate_temp(p.get_virtual_pointer()); - } - - /// This is used to deallocate the device pointer. p is used as a key inside - /// the map to find the device buffer and delete it. - EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard<std::mutex> lock(pmapper_mutex_); - TensorSycl::internal::SYCLfree(p, pMapper); - } - - EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard<std::mutex> lock(pmapper_mutex_); - TensorSycl::internal::SYCLfreeAll(pMapper); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - scratch_buffers.clear(); -#endif - } - - /// The memcpyHostToDevice is used to copy the data from host to device - /// The destination pointer could be deleted before the copy happend which is - /// why a callback function is needed. By default if none is provided, the - /// function is blocking. 
- EIGEN_STRONG_INLINE void memcpyHostToDevice( - void *dst, const void *src, size_t n, - std::function<void()> callback) const { - static const auto write_mode = cl::sycl::access::mode::discard_write; - static const auto global_access = cl::sycl::access::target::global_buffer; - typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access> - write_accessor; - if (n == 0) { - if (callback) callback(); - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n); - buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src); - auto non_deleter = [](buffer_scalar_t const *) {}; - std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter); - cgh.copy(s_ptr, dst_acc); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - synchronize_and_callback(e, callback); - } - - /// The memcpyDeviceToHost is used to copy the data from device to host. - /// The source pointer could be deleted before the copy happend which is - /// why a callback function is needed. By default if none is provided, the - /// function is blocking. 
- EIGEN_STRONG_INLINE void memcpyDeviceToHost( - void *dst, const void *src, size_t n, - std::function<void()> callback) const { - static const auto read_mode = cl::sycl::access::mode::read; - static const auto global_access = cl::sycl::access::target::global_buffer; - typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access> - read_accessor; - if (n == 0) { - if (callback) callback(); - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n); - buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst); - auto non_deleter = [](buffer_scalar_t *) {}; - std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter); - cgh.copy(src_acc, s_ptr); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - synchronize_and_callback(e, callback); - } - - /// The memcpy function. - /// No callback is required here as both arguments are on the device - /// and SYCL can handle the dependency. - EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - static const auto read_mode = cl::sycl::access::mode::read; - static const auto write_mode = cl::sycl::access::mode::discard_write; - if (n == 0) { - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - auto src_acc = get_range_accessor<read_mode>(cgh, src, n); - auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n); - cgh.copy(src_acc, dst_acc); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - async_synchronize(e); - } - - /// the memset function. - /// No callback is required here as both arguments are on the device - /// and SYCL can handle the dependency. 
- EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - static const auto write_mode = cl::sycl::access::mode::discard_write; - if (n == 0) { - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - auto dst_acc = get_range_accessor<write_mode>(cgh, data, n); - // The cast to uint8_t is here to match the behaviour of the standard - // memset. The cast to buffer_scalar_t is needed to match the type of the - // accessor (in case buffer_scalar_t is not uint8_t) - cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c))); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - async_synchronize(e); - } - - /// Get a range accessor to the virtual pointer's device memory. This range - /// accessor will allow access to the memory from the pointer to the end of - /// the buffer. - /// - /// NOTE: Inside a kernel the range accessor will always be indexed from the - /// start of the buffer, so the offset in the accessor is only used by - /// methods like handler::copy and will not be available inside a kernel. 
- template <cl::sycl::access::mode AcMd, typename T> - EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T> - get_range_accessor(const void *ptr) const { - static const auto global_access = cl::sycl::access::target::global_buffer; - static const auto is_place_holder = cl::sycl::access::placeholder::true_t; - typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type; - typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; - - std::lock_guard<std::mutex> lock(pmapper_mutex_); - - auto original_buffer = pMapper.get_buffer(ptr); - const ptrdiff_t offset = pMapper.get_offset(ptr); - const ptrdiff_t typed_offset = offset / sizeof(T); - eigen_assert(typed_offset >= 0); - const auto typed_size = original_buffer.get_size() / sizeof(T); - auto buffer = original_buffer.template reinterpret< - typename Eigen::internal::remove_const<T>::type>( - cl::sycl::range<1>(typed_size)); - const ptrdiff_t size = buffer.get_count() - typed_offset; - eigen_assert(size >= 0); - typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type, - 1, AcMd, global_access, is_place_holder> - placeholder_accessor_t; - const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset; - return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), - cl::sycl::id<1>(typed_offset)), - static_cast<size_t>(typed_offset), - reinterpret_cast<std::intptr_t>(start_ptr)); - } - - /// Get a range accessor to the virtual pointer's device memory with a - /// specified size. 
- template <cl::sycl::access::mode AcMd, typename Index> - EIGEN_STRONG_INLINE cl::sycl::accessor< - buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_range_accessor(cl::sycl::handler &cgh, const void *ptr, - const Index n_bytes) const { - static const auto global_access = cl::sycl::access::target::global_buffer; - eigen_assert(n_bytes >= 0); - std::lock_guard<std::mutex> lock(pmapper_mutex_); - auto buffer = pMapper.get_buffer(ptr); - const ptrdiff_t offset = pMapper.get_offset(ptr); - eigen_assert(offset >= 0); - eigen_assert(offset + n_bytes <= buffer.get_size()); - return buffer.template get_access<AcMd, global_access>( - cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); - } - - /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if - /// not, the function then adds an entry by creating a sycl buffer for that - /// particular pointer. - template <cl::sycl::access::mode AcMd> - EIGEN_STRONG_INLINE cl::sycl::accessor< - buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { - std::lock_guard<std::mutex> lock(pmapper_mutex_); - return pMapper.get_buffer(ptr) - .template get_access<AcMd, cl::sycl::access::target::global_buffer>( - cgh); - } - - EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer( - const void *ptr) const { - std::lock_guard<std::mutex> lock(pmapper_mutex_); - return pMapper.get_buffer(ptr); - } - - EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - std::lock_guard<std::mutex> lock(pmapper_mutex_); - return pMapper.get_offset(ptr); - } - - template <typename OutScalar, typename sycl_kernel, typename Lhs, - typename Rhs, typename OutPtr, typename Range, typename Index, - typename... 
T> - EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs, - const Rhs &rhs, OutPtr outptr, - Range thread_range, - Index scratchSize, - T... var) const { - auto kernel_functor = [=](cl::sycl::handler &cgh) { - // binding the placeholder accessors to a commandgroup handler - lhs.bind(cgh); - rhs.bind(cgh); - outptr.bind(cgh); - typedef cl::sycl::accessor<OutScalar, 1, - cl::sycl::access::mode::read_write, - cl::sycl::access::target::local> - LocalAccessor; - - LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); - cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - program().template get_kernel<sycl_kernel>(), -#endif - thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); - async_synchronize(e); - } - - template <typename OutScalar, typename sycl_kernel, typename InPtr, - typename OutPtr, typename Range, typename Index, typename... T> - EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr, - OutPtr &outptr, - Range thread_range, - Index scratchSize, - T... var) const { - auto kernel_functor = [=](cl::sycl::handler &cgh) { - // binding the placeholder accessors to a commandgroup handler - inptr.bind(cgh); - outptr.bind(cgh); - typedef cl::sycl::accessor<OutScalar, 1, - cl::sycl::access::mode::read_write, - cl::sycl::access::target::local> - LocalAccessor; - - LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); - cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - program().template get_kernel<sycl_kernel>(), -#endif - thread_range, sycl_kernel(scratch, inptr, outptr, var...)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); - async_synchronize(e); - } - - template <typename OutScalar, typename sycl_kernel, typename InPtr, - typename Range, typename Index, typename... 
T> - EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr, - Range thread_range, - Index scratchSize, - T... var) const { - auto kernel_functor = [=](cl::sycl::handler &cgh) { - // binding the placeholder accessors to a commandgroup handler - inptr.bind(cgh); - typedef cl::sycl::accessor<OutScalar, 1, - cl::sycl::access::mode::read_write, - cl::sycl::access::target::local> - LocalAccessor; - - LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); - cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - program().template get_kernel<sycl_kernel>(), -#endif - thread_range, sycl_kernel(scratch, inptr, var...)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); - async_synchronize(e); - } - - - EIGEN_STRONG_INLINE void synchronize() const { -#ifdef EIGEN_EXCEPTIONS - m_queue.wait_and_throw(); -#else - m_queue.wait(); -#endif - } - - - EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { - set_latest_event(e); -#ifndef EIGEN_SYCL_ASYNC_EXECUTION - synchronize(); -#endif - } - - template <typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, - Index &rng, Index &GRange) const { - tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize()); - tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * - EIGEN_SYCL_LOCAL_THREAD_DIM1), - static_cast<Index>(tileSize)); - rng = n; - if (rng == 0) rng = static_cast<Index>(1); - GRange = rng; - if (tileSize > GRange) - tileSize = GRange; - else if (GRange > tileSize) { - Index xMode = static_cast<Index>(GRange % tileSize); - if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode); - } - } - - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template <typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup( - const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range, - cl::sycl::range<2> &local_range) const { - 
std::array<Index, 2> input_range = input_dim; - Index max_workgroup_Size = - static_cast<Index>(getNearestPowerOfTwoWorkGroupSize()); - max_workgroup_Size = - std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * - EIGEN_SYCL_LOCAL_THREAD_DIM1), - static_cast<Index>(max_workgroup_Size)); - Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); - local_range[1] = - static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2))); - input_range[1] = input_dim[1]; - if (input_range[1] == 0) input_range[1] = static_cast<Index>(1); - global_range[1] = input_range[1]; - if (local_range[1] > global_range[1]) - local_range[1] = global_range[1]; - else if (global_range[1] > local_range[1]) { - Index xMode = static_cast<Index>(global_range[1] % local_range[1]); - if (xMode != 0) - global_range[1] += static_cast<Index>(local_range[1] - xMode); - } - local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]); - input_range[0] = input_dim[0]; - if (input_range[0] == 0) input_range[0] = static_cast<Index>(1); - global_range[0] = input_range[0]; - if (local_range[0] > global_range[0]) - local_range[0] = global_range[0]; - else if (global_range[0] > local_range[0]) { - Index xMode = static_cast<Index>(global_range[0] % local_range[0]); - if (xMode != 0) - global_range[0] += static_cast<Index>(local_range[0] - xMode); - } - } - - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template <typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup( - const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range, - cl::sycl::range<3> &local_range) const { - std::array<Index, 3> input_range = input_dim; - Index max_workgroup_Size = - static_cast<Index>(getNearestPowerOfTwoWorkGroupSize()); - max_workgroup_Size = - std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * - EIGEN_SYCL_LOCAL_THREAD_DIM1), - static_cast<Index>(max_workgroup_Size)); - Index pow_of_2 = 
static_cast<Index>(std::log2(max_workgroup_Size)); - local_range[2] = - static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3))); - input_range[2] = input_dim[2]; - if (input_range[2] == 0) input_range[1] = static_cast<Index>(1); - global_range[2] = input_range[2]; - if (local_range[2] > global_range[2]) - local_range[2] = global_range[2]; - else if (global_range[2] > local_range[2]) { - Index xMode = static_cast<Index>(global_range[2] % local_range[2]); - if (xMode != 0) - global_range[2] += static_cast<Index>(local_range[2] - xMode); - } - pow_of_2 = static_cast<Index>( - std::log2(static_cast<Index>(max_workgroup_Size / local_range[2]))); - local_range[1] = - static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2))); - input_range[1] = input_dim[1]; - if (input_range[1] == 0) input_range[1] = static_cast<Index>(1); - global_range[1] = input_range[1]; - if (local_range[1] > global_range[1]) - local_range[1] = global_range[1]; - else if (global_range[1] > local_range[1]) { - Index xMode = static_cast<Index>(global_range[1] % local_range[1]); - if (xMode != 0) - global_range[1] += static_cast<Index>(local_range[1] - xMode); - } - local_range[0] = static_cast<Index>(max_workgroup_Size / - (local_range[1] * local_range[2])); - input_range[0] = input_dim[0]; - if (input_range[0] == 0) input_range[0] = static_cast<Index>(1); - global_range[0] = input_range[0]; - if (local_range[0] > global_range[0]) - local_range[0] = global_range[0]; - else if (global_range[0] > local_range[0]) { - Index xMode = static_cast<Index>(global_range[0] % local_range[0]); - if (xMode != 0) - global_range[0] += static_cast<Index>(local_range[0] - xMode); - } - } - - EIGEN_STRONG_INLINE bool has_local_memory() const { -#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) - return false; -#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) - return true; -#else - return m_device_info.local_mem_type == - 
cl::sycl::info::local_mem_type::local; -#endif - } - - EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { - return m_device_info.max_mem_alloc_size; - } - - EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_device_info.max_compute_units; - } - - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_device_info.max_work_group_size; - } - - EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { - return m_device_info.max_work_item_sizes; - } - - /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } - - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesnot have such concept - return 2; - } - - EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_device_info.local_mem_size; - } - - // This function returns the nearest power of 2 Work-group size which is <= - // maximum device workgroup size. - EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { - return getPowerOfTwo(m_device_info.max_work_group_size, false); - } - - EIGEN_STRONG_INLINE std::string getPlatformName() const { - return m_device_info.platform_name; - } - - EIGEN_STRONG_INLINE std::string getDeviceName() const { - return m_device_info.device_name; - } - - EIGEN_STRONG_INLINE std::string getDeviceVendor() const { - return m_device_info.device_vendor; - } - - // This function returns the nearest power of 2 - // if roundup is true returns result>=wgsize - // else it return result <= wgsize - EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { - if (roundUp) --wGSize; - wGSize |= (wGSize >> 1); - wGSize |= (wGSize >> 2); - wGSize |= (wGSize >> 4); - wGSize |= (wGSize >> 8); - wGSize |= (wGSize >> 16); -#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 - wGSize |= (wGSize >> 32); -#endif - return ((!roundUp) ? 
(wGSize - (wGSize >> 1)) : ++wGSize); - } - - EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } - - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - if (!exception_caught_) { - synchronize(); - } - return !exception_caught_; - } - - EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { -#ifdef EIGEN_SYCL_STORE_LATEST_EVENT - std::lock_guard<std::mutex> lock(event_mutex_); - return latest_events_[std::this_thread::get_id()]; -#else - eigen_assert(false); - return cl::sycl::event(); -#endif - } - - // destructor - ~QueueInterface() { - pMapper.clear(); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - scratch_buffers.clear(); -#endif - } - - protected: - EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { -#ifdef EIGEN_SYCL_STORE_LATEST_EVENT - std::lock_guard<std::mutex> lock(event_mutex_); - latest_events_[std::this_thread::get_id()] = e; -#else - EIGEN_UNUSED_VARIABLE(e); -#endif - } - - void synchronize_and_callback(cl::sycl::event e, - const std::function<void()> &callback) const { - set_latest_event(e); - if (callback) { - auto callback_ = [=]() { -#ifdef EIGEN_EXCEPTIONS - cl::sycl::event(e).wait_and_throw(); -#else - cl::sycl::event(e).wait(); -#endif - callback(); - }; - m_thread_pool.Schedule(std::move(callback_)); - } else { -#ifdef EIGEN_EXCEPTIONS - m_queue.wait_and_throw(); -#else - m_queue.wait(); -#endif - } - } - - bool sycl_async_handler(cl::sycl::exception_list exceptions) const { - bool exception_caught = false; - for (const auto &e : exceptions) { - if (e) { - exception_caught = true; - EIGEN_THROW_X(e); - } - } - return exception_caught; - } - - /// class members: - bool exception_caught_ = false; - - mutable std::mutex pmapper_mutex_; - -#ifdef EIGEN_SYCL_STORE_LATEST_EVENT - mutable std::mutex event_mutex_; - mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_; -#endif - - /// std::map is the 
container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of - /// SyclDevice. If a non-read-only pointer is needed to be accessed on the - /// host we should manually deallocate it. - mutable TensorSycl::internal::PointerMapper pMapper; -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - mutable std::unordered_set<void *> scratch_buffers; -#endif - /// sycl queue - mutable cl::sycl::queue m_queue; -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - mutable cl::sycl::program m_prog; -#endif - - /// The thread pool is used to wait on events and call callbacks - /// asynchronously - mutable Eigen::ThreadPool m_thread_pool; - - const TensorSycl::internal::SyclDeviceInfo m_device_info; -}; - -struct SyclDeviceBase { - /// QueueInterface is not owned. it is the caller's responsibility to destroy - /// it - const QueueInterface *m_queue_stream; - explicit SyclDeviceBase(const QueueInterface *queue_stream) - : m_queue_stream(queue_stream) {} - EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { - return m_queue_stream; - } -}; - -// Here is a sycl device struct which accept the sycl queue interface -// as an input -struct SyclDevice : public SyclDeviceBase { - explicit SyclDevice(const QueueInterface *queue_stream) - : SyclDeviceBase(queue_stream) {} - - // this is the accessor used to construct the evaluator - template <cl::sycl::access::mode AcMd, typename T> - EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T> - get_range_accessor(const void *ptr) const { - return queue_stream()->template get_range_accessor<AcMd, T>(ptr); - } - - // get sycl accessor - template <cl::sycl::access::mode AcMd> - EIGEN_STRONG_INLINE cl::sycl::accessor< - buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { - return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr); - } - - /// Accessing the created sycl device buffer for the 
device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer( - const void *ptr) const { - return queue_stream()->get_sycl_buffer(ptr); - } - - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template <typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, - Index &rng, Index &GRange) const { - queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); - } - - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template <typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup( - const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range, - cl::sycl::range<2> &local_range) const { - queue_stream()->parallel_for_setup(input_dim, global_range, local_range); - } - - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template <typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup( - const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range, - cl::sycl::range<3> &local_range) const { - queue_stream()->parallel_for_setup(input_dim, global_range, local_range); - } - - /// allocate device memory - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return queue_stream()->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { - return queue_stream()->allocate_temp(num_bytes); - } - - /// deallocate device memory - EIGEN_STRONG_INLINE void deallocate(void *p) const { - queue_stream()->deallocate(p); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { - queue_stream()->deallocate_temp(buffer); - } - template <cl::sycl::access::mode AcMd, typename T> - EIGEN_STRONG_INLINE void deallocate_temp( - const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const { - queue_stream()->deallocate_temp(buffer); - } - 
EIGEN_STRONG_INLINE void deallocate_all() const { - queue_stream()->deallocate_all(); - } - - template <typename data_t> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, data_t> - get(data_t *data) const { - return queue_stream()->get(data); - } - template <typename data_t> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( - TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, - data_t> - data) const { - return queue_stream()->get(data); - } - - /// attach existing buffer - EIGEN_STRONG_INLINE void *attach_buffer( - cl::sycl::buffer<buffer_scalar_t, 1> &buf) const { - return queue_stream()->attach_buffer(buf); - } - /// detach buffer - EIGEN_STRONG_INLINE void detach_buffer(void *p) const { - queue_stream()->detach_buffer(p); - } - EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return queue_stream()->get_offset(ptr); - } - - // some runtime conditions that can be applied here - EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } - - /// memcpyHostToDevice - template <typename Index> - EIGEN_STRONG_INLINE void memcpyHostToDevice( - Index *dst, const Index *src, size_t n, - std::function<void()> callback = {}) const { - queue_stream()->memcpyHostToDevice(dst, src, n, callback); - } - /// memcpyDeviceToHost - template <typename Index> - EIGEN_STRONG_INLINE void memcpyDeviceToHost( - void *dst, const Index *src, size_t n, - std::function<void()> callback = {}) const { - queue_stream()->memcpyDeviceToHost(dst, src, n, callback); - } - /// the memcpy function - template <typename Index> - EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - queue_stream()->memcpy(dst, src, n); - } - /// the memset function - EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - queue_stream()->memset(data, c, n); - } - /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { - return 
queue_stream()->sycl_queue(); - } -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - EIGEN_STRONG_INLINE cl::sycl::program &program() const { - return queue_stream()->program(); - } -#endif - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on sycl devices. - return firstLevelCacheSize(); - } - EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return queue_stream()->getNumSyclMultiProcessors(); - } - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return queue_stream()->maxSyclThreadsPerBlock(); - } - EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { - return queue_stream()->maxWorkItemSizes(); - } - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesnot have such concept - return queue_stream()->maxSyclThreadsPerMultiProcessor(); - } - EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return queue_stream()->sharedMemPerBlock(); - } - EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { - return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); - } - - EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { - return queue_stream()->getPowerOfTwo(val, roundUp); - } - /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return queue_stream()->majorDeviceVersion(); - } - - EIGEN_STRONG_INLINE void synchronize() const { - queue_stream()->synchronize(); - } - EIGEN_STRONG_INLINE void async_synchronize( - cl::sycl::event e = cl::sycl::event()) const { - queue_stream()->async_synchronize(e); - } - EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { - return queue_stream()->get_latest_event(); - } - - // This function checks if the runtime recorded an error for the - // underlying 
stream device. - EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } - - EIGEN_STRONG_INLINE bool has_local_memory() const { - return queue_stream()->has_local_memory(); - } - EIGEN_STRONG_INLINE long max_buffer_size() const { - return queue_stream()->max_buffer_size(); - } - EIGEN_STRONG_INLINE std::string getPlatformName() const { - return queue_stream()->getPlatformName(); - } - EIGEN_STRONG_INLINE std::string getDeviceName() const { - return queue_stream()->getDeviceName(); - } - EIGEN_STRONG_INLINE std::string getDeviceVendor() const { - return queue_stream()->getDeviceVendor(); - } - template <typename OutScalar, typename KernelType, typename... T> - EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const { - queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>( - var...); - } - template <typename OutScalar, typename KernelType, typename... T> - EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const { - queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>( - var...); - } - - template <typename OutScalar, typename KernelType, typename... T> - EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const { - queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>( - var...); - } -}; -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h deleted file mode 100644 index e524b53..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ /dev/null @@ -1,409 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H - -namespace Eigen { - -// Runs an arbitrary function and then calls Notify() on the passed in -// Notification. -template <typename Function, typename... Args> struct FunctionWrapperWithNotification -{ - static void run(Notification* n, Function f, Args... args) { - f(args...); - if (n) { - n->Notify(); - } - } -}; - -template <typename Function, typename... Args> struct FunctionWrapperWithBarrier -{ - static void run(Barrier* b, Function f, Args... args) { - f(args...); - if (b) { - b->Notify(); - } - } -}; - -template <typename SyncType> -static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { - if (n) { - n->Wait(); - } -} - -// An abstract interface to a device specific memory allocator. -class Allocator { - public: - virtual ~Allocator() {} - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; -}; - -// Build a thread pool device on top the an existing pool of threads. -struct ThreadPoolDevice { - // The ownership of the thread pool remains with the caller. - ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr) - : pool_(pool), num_threads_(num_cores), allocator_(allocator) { } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return allocator_ ? 
allocator_->allocate(num_bytes) - : internal::aligned_malloc(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - if (allocator_) { - allocator_->deallocate(buffer); - } else { - internal::aligned_free(buffer); - } - } - - EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { - return allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { - deallocate(buffer); - } - - template<typename Type> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { - return data; - } - - EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifdef __ANDROID__ - ::memcpy(dst, src, n); -#else - // TODO(rmlarsen): Align blocks on cache lines. - // We have observed that going beyond 4 threads usually just wastes - // CPU cycles due to the threads competing for memory bandwidth, so we - // statically schedule at most 4 block copies here. - const size_t kMinBlockSize = 32768; - const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); - if (n <= kMinBlockSize || num_threads < 2) { - ::memcpy(dst, src, n); - } else { - const char* src_ptr = static_cast<const char*>(src); - char* dst_ptr = static_cast<char*>(dst); - const size_t blocksize = (n + (num_threads - 1)) / num_threads; - Barrier barrier(static_cast<int>(num_threads - 1)); - // Launch the last 3 blocks on worker threads. - for (size_t i = 1; i < num_threads; ++i) { - enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] { - ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, - numext::mini(blocksize, n - (i * blocksize))); - }); - } - // Launch the first block on the main thread. 
- ::memcpy(dst_ptr, src_ptr, blocksize); - barrier.Wait(); - } -#endif - } - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - - EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - - EIGEN_STRONG_INLINE int numThreads() const { - return num_threads_; - } - - // Number of theads available in the underlying thread pool. This number can - // be different from the value returned by numThreads(). - EIGEN_STRONG_INLINE int numThreadsInPool() const { - return pool_->NumThreads(); - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - return l1CacheSize(); - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // The l3 cache size is shared between all the cores. - return l3CacheSize() / num_threads_; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - // Should return an enum that encodes the ISA supported by the CPU - return 1; - } - - template <class Function, class... Args> - EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, - Args&&... args) const { - Notification* n = new Notification(); - pool_->Schedule( - std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, - std::move(f), args...)); - return n; - } - - template <class Function, class... Args> - EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, - Args&&... args) const { - pool_->Schedule( - std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b, - std::move(f), args...)); - } - - template <class Function, class... Args> - EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, - Args&&... 
args) const { - if (sizeof...(args) > 0) { - pool_->Schedule(std::bind(std::move(f), args...)); - } else { - pool_->Schedule(std::move(f)); - } - } - - // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if - // called from one of the threads in pool_. Returns -1 otherwise. - EIGEN_STRONG_INLINE int currentThreadId() const { - return pool_->CurrentThreadId(); - } - - // WARNING: This function is synchronous and will block the calling thread. - // - // Synchronous parallelFor executes f with [0, n) arguments in parallel and - // waits for completion. F accepts a half-open interval [first, last). Block - // size is chosen based on the iteration cost and resulting parallel - // efficiency. If block_align is not nullptr, it is called to round up the - // block size. - void parallelFor(Index n, const TensorOpCost& cost, - std::function<Index(Index)> block_align, - std::function<void(Index, Index)> f) const { - if (EIGEN_PREDICT_FALSE(n <= 0)){ - return; - // Compute small problems directly in the caller thread. - } else if (n == 1 || numThreads() == 1 || - CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) { - f(0, n); - return; - } - - // Compute block size and total count of blocks. - ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); - - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - Barrier barrier(static_cast<unsigned int>(block.count)); - std::function<void(Index, Index)> handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, - Index lastIdx) { - while (lastIdx - firstIdx > block.size) { - // Split into halves and schedule the second half on a different thread. 
- const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; - pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); - lastIdx = midIdx; - } - // Single block or less, execute directly. - f(firstIdx, lastIdx); - barrier.Notify(); - }; - - if (block.count <= numThreads()) { - // Avoid a thread hop by running the root of the tree and one block on the - // main thread. - handleRange(0, n); - } else { - // Execute the root in the thread pool to avoid running work on more than - // numThreads() threads. - pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); - } - - barrier.Wait(); - } - - // Convenience wrapper for parallelFor that does not align blocks. - void parallelFor(Index n, const TensorOpCost& cost, - std::function<void(Index, Index)> f) const { - parallelFor(n, cost, nullptr, std::move(f)); - } - - // WARNING: This function is asynchronous and will not block the calling thread. - // - // Asynchronous parallelFor executes f with [0, n) arguments in parallel - // without waiting for completion. When the last block finished, it will call - // 'done' callback. F accepts a half-open interval [first, last). Block size - // is chosen based on the iteration cost and resulting parallel efficiency. If - // block_align is not nullptr, it is called to round up the block size. - void parallelForAsync(Index n, const TensorOpCost& cost, - std::function<Index(Index)> block_align, - std::function<void(Index, Index)> f, - std::function<void()> done) const { - // Compute small problems directly in the caller thread. - if (n <= 1 || numThreads() == 1 || - CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) { - f(0, n); - done(); - return; - } - - // Compute block size and total count of blocks. 
- ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); - - ParallelForAsyncContext* const ctx = - new ParallelForAsyncContext(block.count, std::move(f), std::move(done)); - - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { - while (lastIdx - firstIdx > block.size) { - // Split into halves and schedule the second half on a different thread. - const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; - pool_->Schedule( - [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); - lastIdx = midIdx; - } - - // Single block or less, execute directly. - ctx->f(firstIdx, lastIdx); - - // Delete async context if it was the last block. - if (ctx->count.fetch_sub(1) == 1) delete ctx; - }; - - if (block.count <= numThreads()) { - // Avoid a thread hop by running the root of the tree and one block on the - // main thread. - ctx->handle_range(0, n); - } else { - // Execute the root in the thread pool to avoid running work on more than - // numThreads() threads. - pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); - } - } - - // Convenience wrapper for parallelForAsync that does not align blocks. - void parallelForAsync(Index n, const TensorOpCost& cost, - std::function<void(Index, Index)> f, - std::function<void()> done) const { - parallelForAsync(n, cost, nullptr, std::move(f), std::move(done)); - } - - // Thread pool accessor. - ThreadPoolInterface* getPool() const { return pool_; } - - // Allocator accessor. - Allocator* allocator() const { return allocator_; } - - private: - typedef TensorCostModel<ThreadPoolDevice> CostModel; - - // For parallelForAsync we must keep passed in closures on the heap, and - // delete them only after `done` callback finished. 
- struct ParallelForAsyncContext { - ParallelForAsyncContext(Index block_count, - std::function<void(Index, Index)> block_f, - std::function<void()> done_callback) - : count(block_count), - f(std::move(block_f)), - done(std::move(done_callback)) {} - ~ParallelForAsyncContext() { done(); } - - std::atomic<Index> count; - std::function<void(Index, Index)> f; - std::function<void()> done; - - std::function<void(Index, Index)> handle_range; - }; - - struct ParallelForBlock { - Index size; // block size - Index count; // number of blocks - }; - - // Calculates block size based on (1) the iteration cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate parallelization - // overheads; not too large to mitigate tail effect and potential load - // imbalance and we also want number of blocks to be evenly dividable across - // threads. - ParallelForBlock CalculateParallelForBlock( - const Index n, const TensorOpCost& cost, - std::function<Index(Index)> block_align) const { - const double block_size_f = 1.0 / CostModel::taskSize(1, cost); - const Index max_oversharding_factor = 4; - Index block_size = numext::mini( - n, numext::maxi<Index>( - divup<Index>(n, max_oversharding_factor * numThreads()), - block_size_f)); - const Index max_block_size = numext::mini(n, 2 * block_size); - - if (block_align) { - Index new_block_size = block_align(block_size); - eigen_assert(new_block_size >= block_size); - block_size = numext::mini(n, new_block_size); - } - - Index block_count = divup(n, block_size); - - // Calculate parallel efficiency as fraction of total CPU time used for - // computations: - double max_efficiency = - static_cast<double>(block_count) / - (divup<int>(block_count, numThreads()) * numThreads()); - - // Now try to increase block size up to max_block_size as long as it - // doesn't decrease parallel efficiency. 
- for (Index prev_block_count = block_count; - max_efficiency < 1.0 && prev_block_count > 1;) { - // This is the next block size that divides size into a smaller number - // of blocks than the current block_size. - Index coarser_block_size = divup(n, prev_block_count - 1); - if (block_align) { - Index new_block_size = block_align(coarser_block_size); - eigen_assert(new_block_size >= coarser_block_size); - coarser_block_size = numext::mini(n, new_block_size); - } - if (coarser_block_size > max_block_size) { - break; // Reached max block size. Stop. - } - // Recalculate parallel efficiency. - const Index coarser_block_count = divup(n, coarser_block_size); - eigen_assert(coarser_block_count < prev_block_count); - prev_block_count = coarser_block_count; - const double coarser_efficiency = - static_cast<double>(coarser_block_count) / - (divup<int>(coarser_block_count, numThreads()) * numThreads()); - if (coarser_efficiency + 0.01 >= max_efficiency) { - // Taking it. - block_size = coarser_block_size; - block_count = coarser_block_count; - if (max_efficiency < coarser_efficiency) { - max_efficiency = coarser_efficiency; - } - } - } - - return {block_size, block_count}; - } - - ThreadPoolInterface* pool_; - int num_threads_; - Allocator* allocator_; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h deleted file mode 100644 index 1a30e45..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h +++ /dev/null @@ -1,236 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H -#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H - -namespace Eigen { - -/** \internal - * - * \class TensorDimensionList - * \ingroup CXX11_Tensor_Module - * - * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. - * - * \sa Tensor - */ - -template <typename Index, std::size_t Rank> struct DimensionList { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - const Index operator[] (const Index i) const { return i; } -}; - -namespace internal { - -template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > { - static const size_t value = Rank; -}; -template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > { - static const size_t value = Rank; -}; - -template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>&) { - return n; -} -template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>&) { - return n; -} - - -#if EIGEN_HAS_CONSTEXPR -template <typename Index, std::size_t Rank> -struct index_known_statically_impl<DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { - return true; - } -}; -template <typename Index, std::size_t Rank> -struct index_known_statically_impl<const DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { - return true; - } -}; - -template <typename Index, std::size_t Rank> -struct all_indices_known_statically_impl<DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; -template <typename Index, std::size_t Rank> -struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return 
true; - } -}; - -template <typename Index, std::size_t Rank> -struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; -template <typename Index, std::size_t Rank> -struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; - -template <typename Index, std::size_t Rank> -struct index_statically_eq_impl<DimensionList<Index, Rank> > { - static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i == value; - } -}; -template <typename Index, std::size_t Rank> -struct index_statically_eq_impl<const DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i == value; - } -}; - -template <typename Index, std::size_t Rank> -struct index_statically_ne_impl<DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i != value; - } -}; -template <typename Index, std::size_t Rank> -struct index_statically_ne_impl<const DimensionList<Index, Rank> > { - static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i != value; - } -}; - -template <typename Index, std::size_t Rank> -struct index_statically_gt_impl<DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i > value; - } -}; -template <typename Index, std::size_t Rank> -struct index_statically_gt_impl<const DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i > value; - } -}; - -template <typename Index, std::size_t Rank> -struct index_statically_lt_impl<DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i 
< value; - } -}; -template <typename Index, std::size_t Rank> -struct index_statically_lt_impl<const DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i < value; - } -}; - -#else -template <typename Index, std::size_t Rank> -struct index_known_statically_impl<DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { - return true; - } -}; -template <typename Index, std::size_t Rank> -struct index_known_statically_impl<const DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { - return true; - } -}; - -template <typename Index, std::size_t Rank> -struct all_indices_known_statically_impl<DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; -template <typename Index, std::size_t Rank> -struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; - -template <typename Index, std::size_t Rank> -struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; -template <typename Index, std::size_t Rank> -struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; - -template <typename Index, std::size_t Rank> -struct index_statically_eq_impl<DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template <typename Index, std::size_t Rank> -struct index_statically_eq_impl<const DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template 
<typename Index, std::size_t Rank> -struct index_statically_ne_impl<DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){ - return false; - } -}; -template <typename Index, std::size_t Rank> -struct index_statically_ne_impl<const DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template <typename Index, std::size_t Rank> -struct index_statically_gt_impl<DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template <typename Index, std::size_t Rank> -struct index_statically_gt_impl<const DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template <typename Index, std::size_t Rank> -struct index_statically_lt_impl<DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template <typename Index, std::size_t Rank> -struct index_statically_lt_impl<const DimensionList<Index, Rank> > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -#endif - -} // end namespace internal -} // end namespace Eigen - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h deleted file mode 100644 index f0f1e83..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h +++ /dev/null @@ -1,490 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H - - -namespace Eigen { - -/** \internal - * - * \class TensorDimensions - * \ingroup CXX11_Tensor_Module - * - * \brief Set of classes used to encode and store the dimensions of a Tensor. - * - * The Sizes class encodes as part of the type the number of dimensions and the - * sizes corresponding to each dimension. It uses no storage space since it is - * entirely known at compile time. - * The DSizes class is its dynamic sibling: the number of dimensions is known - * at compile time but the sizes are set during execution. - * - * \sa Tensor - */ - -// Boilerplate code -namespace internal { - -template<std::ptrdiff_t n, typename Dimension> struct dget { - static const std::ptrdiff_t value = get<n, Dimension>::value; -}; - - -template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor> -struct fixed_size_tensor_index_linearization_helper -{ - template <typename Dimensions> EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices, - const Dimensions& dimensions) - { - return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) + - dget<RowMajor ? 
n - 1 : (NumIndices - n), Dimensions>::value * - fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); - } -}; - -template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor> -struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> -{ - template <typename Dimensions> EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&) - { - return 0; - } -}; - -template<typename Index, std::ptrdiff_t n> -struct fixed_size_tensor_index_extraction_helper -{ - template <typename Dimensions> EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Index run(const Index index, - const Dimensions& dimensions) - { - const Index mult = (index == n-1) ? 1 : 0; - return array_get<n-1>(dimensions) * mult + - fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions); - } -}; - -template<typename Index> -struct fixed_size_tensor_index_extraction_helper<Index, 0> -{ - template <typename Dimensions> EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Index run(const Index, - const Dimensions&) - { - return 0; - } - }; - -} // end namespace internal - - -// Fixed size -#ifndef EIGEN_EMULATE_CXX11_META_H -template <typename std::ptrdiff_t... Indices> -struct Sizes { - typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base; - const Base t = Base(); - static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); - static const ptrdiff_t count = Base::count; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { - return Base::count; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { - return internal::arg_prod(Indices...); - } - - EIGEN_DEVICE_FUNC Sizes() { } - template <typename DenseIndex> - explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) { - // todo: add assertion - } -#if EIGEN_HAS_VARIADIC_TEMPLATES - template <typename... 
DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } - explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) { - // todo: add assertion - } -#endif - - template <typename T> Sizes& operator = (const T& /*other*/) { - // add assertion failure if the size of other is different - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const { - return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t); - } - - template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t); - } - template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t); - } -}; - -namespace internal { -template <typename std::ptrdiff_t... 
Indices> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) { - return Sizes<Indices...>::total_size; -} -} - -#else - -template <std::ptrdiff_t n> -struct non_zero_size { - typedef internal::type2val<std::ptrdiff_t, n> type; -}; -template <> -struct non_zero_size<0> { - typedef internal::null_type type; -}; - -template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes { - typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base; - static const std::ptrdiff_t count = Base::count; - static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const { - return count; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() { - return internal::arg_prod<Base>::value; - } - - Sizes() { } - template <typename DenseIndex> - explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) { - // todo: add assertion - } - template <typename T> Sizes& operator = (const T& /*other*/) { - // add assertion failure if the size of other is different - return *this; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template <typename... DenseIndex> Sizes(DenseIndex... 
/*indices*/) { } - explicit Sizes(std::initializer_list<std::ptrdiff_t>) { - // todo: add assertion - } -#else - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const { - switch (index) { - case 0: - return internal::get<0, Base>::value; - case 1: - return internal::get<1, Base>::value; - case 2: - return internal::get<2, Base>::value; - case 3: - return internal::get<3, Base>::value; - case 4: - return internal::get<4, Base>::value; - default: - eigen_assert(false && "index overflow"); - return static_cast<Index>(-1); - } - } - - template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this)); - } - template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this)); - } -}; - -namespace internal { -template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) { - return Sizes<V1, V2, V3, V4, V5>::total_size; -} -} - -#endif - -// Boilerplate -namespace internal { 
-template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor> -struct tensor_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions) - { - return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + - array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * - tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); - } -}; - -template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor> -struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&) - { - return array_get<RowMajor ? 0 : NumIndices - 1>(indices); - } -}; -} // end namespace internal - - - -// Dynamic size -template <typename DenseIndex, int NumDims> -struct DSizes : array<DenseIndex, NumDims> { - typedef array<DenseIndex, NumDims> Base; - static const int count = NumDims; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { - return NumDims; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { - return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = 0; - } - } - EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { } - - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { - eigen_assert(NumDims == 1); - (*this)[0] = i0; - } - - EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = a[i]; - } - } - - // Enable DSizes index type promotion only if we are promoting to the - // larger type, e.g. allow to promote dimensions of type int to long. 
- template<typename OtherIndex> - EIGEN_DEVICE_FUNC - explicit DSizes(const array<OtherIndex, NumDims>& other, - // Default template parameters require c++11. - typename internal::enable_if< - internal::is_same< - DenseIndex, - typename internal::promote_index_type< - DenseIndex, - OtherIndex - >::type - >::value, void*>::type = 0) { - for (int i = 0; i < NumDims; ++i) { - (*this)[i] = static_cast<DenseIndex>(other[i]); - } - } - -#ifdef EIGEN_HAS_INDEX_LIST - template <typename FirstType, typename... OtherTypes> - EIGEN_DEVICE_FUNC - explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) { - for (int i = 0; i < dimensions.count; ++i) { - (*this)[i] = dimensions[i]; - } - } -#endif - -#ifndef EIGEN_EMULATE_CXX11_META_H - template <typename std::ptrdiff_t... Indices> - EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = a[i]; - } - } -#else - template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> - EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = a[i]; - } - } -#endif - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... 
otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) { - eigen_assert(NumDims == 2); - (*this)[0] = i0; - (*this)[1] = i1; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { - eigen_assert(NumDims == 3); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { - eigen_assert(NumDims == 4); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { - eigen_assert(NumDims == 5); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - (*this)[4] = i4; - } -#endif - - EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) { - *static_cast<Base*>(this) = other; - return *this; - } - - // A constexpr would be so much better here - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const { - return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this)); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const { - return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this)); - } -}; - -template <typename IndexType, int NumDims> -std::ostream& operator<<(std::ostream& os, - const DSizes<IndexType, NumDims>& dims) { - os << "["; - for (int i = 0; i < NumDims; ++i) { - if (i > 0) os << ", "; - os << dims[i]; - } - os << "]"; - return os; -} - -// 
Boilerplate -namespace internal { -template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor> -struct tensor_vsize_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions) - { - return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + - array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * - tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); - } -}; - -template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor> -struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor> -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&) - { - return array_get<RowMajor ? 0 : NumIndices - 1>(indices); - } -}; -} // end namespace internal - - -namespace internal { - -template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > { - static const ptrdiff_t value = NumDims; -}; -template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > { - static const ptrdiff_t value = NumDims; -}; -#ifndef EIGEN_EMULATE_CXX11_META_H -template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > { -static const std::ptrdiff_t value = Sizes<Indices...>::count; -}; -template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...> > { -static const std::ptrdiff_t value = Sizes<Indices...>::count; -}; -template <std::ptrdiff_t n, typename std::ptrdiff_t... 
Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) { - return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value; -} -template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { - eigen_assert(false && "should never be called"); - return -1; -} -#else -template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > { - static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count; -}; -template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > { - static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count; -}; -template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) { - return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value; -} - -#endif - - -template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m> -struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { - return false; - } -}; -template <typename Dims1, typename Dims2, ptrdiff_t n> -struct sizes_match_below_dim<Dims1, Dims2, n, n> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { - return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) && - sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2); - } -}; -template <typename Dims1, typename Dims2> -struct sizes_match_below_dim<Dims1, Dims2, 0, 0> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { - return true; - } -}; - -} // end namespace internal - - -template <typename Dims1, typename Dims2> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { - return 
internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2); -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h deleted file mode 100644 index a48d035..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h +++ /dev/null @@ -1,236 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H -#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H - -namespace Eigen { - -/** \class TensorForcedEval - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. - * - * - */ -namespace internal { -template<typename XprType, template <class> class MakePointer_> -struct traits<TensorEvalToOp<XprType, MakePointer_> > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename MakePointer_<Scalar>::Type PointerType; - - enum { - Flags = 0 - }; - template <class T> - struct MakePointer { - // Intermediate typedef to workaround MSVC issue. 
- typedef MakePointer_<T> MakePointerT; - typedef typename MakePointerT::Type Type; - - - }; -}; - -template<typename XprType, template <class> class MakePointer_> -struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense> -{ - typedef const TensorEvalToOp<XprType, MakePointer_>& type; -}; - -template<typename XprType, template <class> class MakePointer_> -struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type> -{ - typedef TensorEvalToOp<XprType, MakePointer_> type; -}; - -} // end namespace internal - - - - -template<typename XprType, template <class> class MakePointer_> -class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename MakePointer_<CoeffReturnType>::Type PointerType; - typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested; - typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index; - - static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) - : m_xpr(expr), m_buffer(buffer) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } - - protected: - typename XprType::Nested m_xpr; - PointerType m_buffer; -}; - - - -template<typename ArgType, typename Device, template <class> class MakePointer_> -struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> -{ - typedef TensorEvalToOp<ArgType, 
MakePointer_> XprType; - typedef typename ArgType::Scalar Scalar; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; - typedef typename XprType::Index Index; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = true, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = true - }; - - static const int NumDims = internal::traits<ArgType>::NumDimensions; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock - ArgTensorBlock; - - typedef internal::TensorBlockAssignment< - CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index> - TensorBlockAssignment; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} - - - EIGEN_STRONG_INLINE ~TensorEvaluator() { - } - - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { - EIGEN_UNUSED_VARIABLE(scalar); 
- eigen_assert(scalar == NULL); - return m_impl.evalSubExprsIfNeeded(m_buffer); - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType scalar, EvalSubExprsCallback done) { - EIGEN_UNUSED_VARIABLE(scalar); - eigen_assert(scalar == NULL); - m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { - m_buffer[i] = m_impl.coeff(i); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return m_impl.getResourceRequirements(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( - TensorBlockDesc& desc, TensorBlockScratch& scratch) { - // Add `m_buffer` as destination buffer to the block descriptor. - desc.template AddDestinationBuffer<Layout>( - /*dst_base=*/m_buffer + desc.offset(), - /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions())); - - ArgTensorBlock block = - m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); - - // If block was evaluated into a destination buffer, there is no need to do - // an assignment. 
- if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { - TensorBlockAssignment::Run( - TensorBlockAssignment::target( - desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()), - m_buffer, desc.offset()), - block.expr()); - } - block.cleanup(); - } - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_buffer[index]; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // We assume that evalPacket or evalScalar is called to perform the - // assignment and account for the cost of the write here. - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } - ArgType expression() const { return m_expression; } - #ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - m_buffer.bind(cgh); - } - #endif - - - private: - TensorEvaluator<ArgType, Device> m_impl; - EvaluatorPointerType m_buffer; - const ArgType m_expression; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h deleted file mode 100644 index 3aff7fa..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h +++ /dev/null @@ -1,983 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H - -namespace Eigen { - -/** \class TensorEvaluator - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor evaluator classes. - * - * These classes are responsible for the evaluation of the tensor expression. - * - * TODO: add support for more types of expressions, in particular expressions - * leading to lvalues (slicing, reshaping, etc...) - */ - -// Generic evaluator -template<typename Derived, typename Device> -struct TensorEvaluator -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - typedef Derived XprType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType; - typedef StorageMemory<Scalar, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - // NumDimensions is -1 for variable dim tensors - static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? 
- internal::traits<Derived>::NumDimensions : 0; - - enum { - IsAligned = Derived::IsAligned, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value, - PreferBlockAccess = false, - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(device.get((const_cast<TensorPointerType>(m.data())))), - m_dims(m.dimensions()), - m_device(device) - { } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { - if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) { - m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); - return false; - } - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType dest, EvalSubExprsCallback done) { - // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. 
- done(evalSubExprsIfNeeded(dest)); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data != NULL); - return m_data[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - eigen_assert(m_data != NULL); - return m_data[index]; - } - - template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); - } - - // Return a packet starting at `index` where `umask` specifies which elements - // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for - // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding - // float element will be loaded, otherwise 0 will be loaded. - // Function has been templatized to enable Sfinae. - template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type - partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const - { - return internal::ploadu<PacketReturnTypeT>(m_data + index, umask); - } - - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { - eigen_assert(m_data != NULL); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - return m_data[m_dims.IndexOfColMajor(coords)]; - } else { - return m_data[m_dims.IndexOfRowMajor(coords)]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& - coeffRef(const array<DenseIndex, NumCoords>& coords) { - 
eigen_assert(m_data != NULL); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - return m_data[m_dims.IndexOfColMajor(coords)]; - } else { - return m_data[m_dims.IndexOfRowMajor(coords)]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - PacketType<CoeffReturnType, Device>::size); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::any(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - assert(m_data != NULL); - return TensorBlock::materialize(m_data, m_dims, desc, scratch); - } - - template<typename TensorBlock> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - assert(m_data != NULL); - - typedef typename TensorBlock::XprType TensorBlockExpr; - typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr, - Index> - TensorBlockAssign; - - TensorBlockAssign::Run( - TensorBlockAssign::target(desc.dimensions(), - internal::strides<Layout>(m_dims), m_data, - desc.offset()), - block.expr()); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } -#endif - protected: - EvaluatorPointerType m_data; - Dimensions m_dims; - const Device EIGEN_DEVICE_REF m_device; -}; - -namespace { -template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T loadConstant(const T* address) { - return *address; -} -// Use the texture cache on CUDA devices whenever possible -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH 
>= 350 -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float loadConstant(const float* address) { - return __ldg(address); -} -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double loadConstant(const double* address) { - return __ldg(address); -} -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -Eigen::half loadConstant(const Eigen::half* address) { - return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); -} -#endif -#ifdef EIGEN_USE_SYCL -// overload of load constant should be implemented here based on range access -template <cl::sycl::access::mode AcMd, typename T> -T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) { - return *address; -} -#endif -} - - -// Default evaluator for rvalues -template<typename Derived, typename Device> -struct TensorEvaluator<const Derived, Device> -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - typedef const Derived XprType; - typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType; - typedef StorageMemory<const Scalar, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - // NumDimensions is -1 for variable dim tensors - static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? 
- internal::traits<Derived>::NumDimensions : 0; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - - enum { - IsAligned = Derived::IsAligned, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = internal::is_arithmetic<ScalarNoConst>::value, - PreferBlockAccess = false, - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) { - m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); - return false; - } - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType dest, EvalSubExprsCallback done) { - // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. 
- done(evalSubExprsIfNeeded(dest)); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data != NULL); - return loadConstant(m_data+index); - } - - template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index); - } - - // Return a packet starting at `index` where `umask` specifies which elements - // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for - // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding - // float element will be loaded, otherwise 0 will be loaded. - // Function has been templatized to enable Sfinae. - template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type - partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const - { - return internal::ploadu<PacketReturnTypeT>(m_data + index, umask); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { - eigen_assert(m_data != NULL); - const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) - : m_dims.IndexOfRowMajor(coords); - return loadConstant(m_data+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - PacketType<CoeffReturnType, Device>::size); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::any(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - assert(m_data != NULL); - return TensorBlock::materialize(m_data, m_dims, desc, scratch); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } -#endif - protected: - EvaluatorPointerType m_data; - Dimensions m_dims; - const Device EIGEN_DEVICE_REF m_device; -}; - - - - -// -------------------- CwiseNullaryOp -------------------- - -template<typename NullaryOp, typename ArgType, typename Device> -struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> -{ - typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType; - - TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits<XprType>::Scalar CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; - typedef 
StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = true, - PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess - #ifdef EIGEN_USE_SYCL - && (PacketType<CoeffReturnType, Device>::size >1) - #endif - , - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - done(true); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_wrapper(m_functor, index); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - PacketType<CoeffReturnType, Device>::size); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_argImpl.bind(cgh); - } -#endif - - private: - const NullaryOp m_functor; - 
TensorEvaluator<ArgType, Device> m_argImpl; - const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper; -}; - - - -// -------------------- CwiseUnaryOp -------------------- - -template<typename UnaryOp, typename ArgType, typename Device> -struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> -{ - typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType; - - enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = int(TensorEvaluator<ArgType, Device>::PacketAccess) & - int(internal::functor_traits<UnaryOp>::PacketAccess), - BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - TensorEvaluator(const XprType& op, const Device& device) - : m_device(device), - m_functor(op.functor()), - m_argImpl(op.nestedExpression(), device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - typedef typename internal::traits<XprType>::Scalar CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - static const int NumDims = internal::array_size<Dimensions>::value; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock - ArgTensorBlock; - - typedef 
internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_argImpl.evalSubExprsIfNeeded(NULL); - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_argImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_argImpl.coeff(index)); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits<UnaryOp>::Cost; - return m_argImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - static const double functor_cost = internal::functor_traits<UnaryOp>::Cost; - return m_argImpl.getResourceRequirements().addCostPerCoeff( - {0, 0, functor_cost / PacketSize}); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - return TensorBlock(m_argImpl.block(desc, scratch), m_functor); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group 
handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{ - m_argImpl.bind(cgh); - } -#endif - - - private: - const Device EIGEN_DEVICE_REF m_device; - const UnaryOp m_functor; - TensorEvaluator<ArgType, Device> m_argImpl; -}; - - -// -------------------- CwiseBinaryOp -------------------- - -template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device> -struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device> -{ - typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType; - - enum { - IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) & - int(TensorEvaluator<RightArgType, Device>::IsAligned), - PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) & - int(TensorEvaluator<RightArgType, Device>::PacketAccess) & - int(internal::functor_traits<BinaryOp>::PacketAccess), - BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) & - int(TensorEvaluator<RightArgType, Device>::BlockAccess), - PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) | - int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess), - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - TensorEvaluator(const XprType& op, const Device& device) - : m_device(device), - m_functor(op.functor()), - m_leftImpl(op.lhsExpression(), device), - m_rightImpl(op.rhsExpression(), device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits<XprType>::Scalar 
CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - static const int NumDims = internal::array_size< - typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock - LeftTensorBlock; - typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock - RightTensorBlock; - - typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock, - RightTensorBlock> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use right impl instead if right impl dimensions are known at compile time. - return m_leftImpl.dimensions(); - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - // TODO(ezhulenev): Evaluate two expression in parallel? 
- m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { - m_rightImpl.evalSubExprsIfNeededAsync(nullptr, - [done](bool) { done(true); }); - }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); - } - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits<BinaryOp>::Cost; - return m_leftImpl.costPerCoeff(vectorized) + - m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - static const double functor_cost = internal::functor_traits<BinaryOp>::Cost; - return internal::TensorBlockResourceRequirements::merge( - m_leftImpl.getResourceRequirements(), - m_rightImpl.getResourceRequirements()) - .addCostPerCoeff({0, 0, functor_cost / PacketSize}); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - desc.DropDestinationBuffer(); - return TensorBlock(m_leftImpl.block(desc, scratch), - m_rightImpl.block(desc, scratch), m_functor); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - - #ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_leftImpl.bind(cgh); - m_rightImpl.bind(cgh); - } - #endif - 
private: - const Device EIGEN_DEVICE_REF m_device; - const BinaryOp m_functor; - TensorEvaluator<LeftArgType, Device> m_leftImpl; - TensorEvaluator<RightArgType, Device> m_rightImpl; -}; - -// -------------------- CwiseTernaryOp -------------------- - -template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device> -struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device> -{ - typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType; - - enum { - IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned, - PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess && - TensorEvaluator<Arg2Type, Device>::PacketAccess && - TensorEvaluator<Arg3Type, Device>::PacketAccess && - internal::functor_traits<TernaryOp>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess || - TensorEvaluator<Arg2Type, Device>::PreferBlockAccess || - TensorEvaluator<Arg3Type, Device>::PreferBlockAccess, - Layout = TensorEvaluator<Arg1Type, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_arg1Impl(op.arg1Expression(), device), - m_arg2Impl(op.arg2Expression(), device), - m_arg3Impl(op.arg3Expression(), device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - - EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind, - typename internal::traits<Arg2Type>::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind, - 
typename internal::traits<Arg3Type>::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index, - typename internal::traits<Arg2Type>::Index>::value), - STORAGE_INDEX_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index, - typename internal::traits<Arg3Type>::Index>::value), - STORAGE_INDEX_MUST_MATCH) - - eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits<XprType>::Scalar CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use arg2 or arg3 dimensions if they are known at compile time. 
- return m_arg1Impl.dimensions(); - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_arg1Impl.evalSubExprsIfNeeded(NULL); - m_arg2Impl.evalSubExprsIfNeeded(NULL); - m_arg3Impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_STRONG_INLINE void cleanup() { - m_arg1Impl.cleanup(); - m_arg2Impl.cleanup(); - m_arg3Impl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); - } - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index), - m_arg2Impl.template packet<LoadMode>(index), - m_arg3Impl.template packet<LoadMode>(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits<TernaryOp>::Cost; - return m_arg1Impl.costPerCoeff(vectorized) + - m_arg2Impl.costPerCoeff(vectorized) + - m_arg3Impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_arg1Impl.bind(cgh); - m_arg2Impl.bind(cgh); - m_arg3Impl.bind(cgh); - } -#endif - - private: - const TernaryOp m_functor; - TensorEvaluator<Arg1Type, Device> m_arg1Impl; - TensorEvaluator<Arg2Type, Device> m_arg2Impl; - TensorEvaluator<Arg3Type, Device> m_arg3Impl; -}; - - -// -------------------- SelectOp -------------------- - -template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device> -struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> -{ - typedef TensorSelectOp<IfArgType, ThenArgType, 
ElseArgType> XprType; - typedef typename XprType::Scalar Scalar; - - enum { - IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & - TensorEvaluator<ElseArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & - TensorEvaluator<ElseArgType, Device>::PacketAccess & - PacketType<Scalar, Device>::HasBlend, - BlockAccess = TensorEvaluator<IfArgType, Device>::BlockAccess && - TensorEvaluator<ThenArgType, Device>::BlockAccess && - TensorEvaluator<ElseArgType, Device>::BlockAccess, - PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess || - TensorEvaluator<ThenArgType, Device>::PreferBlockAccess || - TensorEvaluator<ElseArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<IfArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - TensorEvaluator(const XprType& op, const Device& device) - : m_condImpl(op.ifExpression(), device), - m_thenImpl(op.thenExpression(), device), - m_elseImpl(op.elseExpression(), device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); - eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename internal::traits<XprType>::Scalar CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; - typedef StorageMemory<CoeffReturnType, Device> Storage; - 
typedef typename Storage::Type EvaluatorPointerType; - - static const int NumDims = internal::array_size<Dimensions>::value; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock - IfArgTensorBlock; - typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock - ThenArgTensorBlock; - typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock - ElseArgTensorBlock; - - struct TensorSelectOpBlockFactory { - template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType> - struct XprType { - typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type; - }; - - template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType> - typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr( - const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const { - return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr); - } - }; - - typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory, - IfArgTensorBlock, ThenArgTensorBlock, - ElseArgTensorBlock> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use then or else impl instead if they happen to be known at compile time. 
- return m_condImpl.dimensions(); - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_condImpl.evalSubExprsIfNeeded(NULL); - m_thenImpl.evalSubExprsIfNeeded(NULL); - m_elseImpl.evalSubExprsIfNeeded(NULL); - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { - m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { - m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); - }); - }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_condImpl.cleanup(); - m_thenImpl.cleanup(); - m_elseImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); - } - template<int LoadMode> - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - internal::Selector<PacketSize> select; - EIGEN_UNROLL_LOOP - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet<LoadMode>(index), - m_elseImpl.template packet<LoadMode>(index)); - - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return m_condImpl.costPerCoeff(vectorized) + - m_thenImpl.costPerCoeff(vectorized) - .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - auto then_req = m_thenImpl.getResourceRequirements(); - auto else_req = m_elseImpl.getResourceRequirements(); - - auto merged_req = - internal::TensorBlockResourceRequirements::merge(then_req, else_req); - merged_req.cost_per_coeff = - then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); - - return 
internal::TensorBlockResourceRequirements::merge( - m_condImpl.getResourceRequirements(), merged_req); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - // It's unsafe to pass destination buffer to underlying expressions, because - // output might be aliased with one of the inputs. - desc.DropDestinationBuffer(); - - return TensorBlock( - m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), - m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_condImpl.bind(cgh); - m_thenImpl.bind(cgh); - m_elseImpl.bind(cgh); - } -#endif - private: - TensorEvaluator<IfArgType, Device> m_condImpl; - TensorEvaluator<ThenArgType, Device> m_thenImpl; - TensorEvaluator<ElseArgType, Device> m_elseImpl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h deleted file mode 100644 index c52fb77..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h +++ /dev/null @@ -1,703 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H - -namespace Eigen { - -/** - * \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - * - * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and - * instructions) - * @tparam Tiling can use block based tensor evaluation - * (see TensorBlock.h) - */ -namespace internal { - -/** - * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely - * expensive. If expression has at least one broadcast op in it, and it supports - * block based evaluation, we always prefer it, even for the small tensors. For - * all other tileable ops, block evaluation overhead for small tensors (fits - * into L1) is too large, and we fallback on vectorized evaluation. - */ - -// TODO(ezhulenev): Add specializations for all other types of Tensor ops. 
- -template<typename Expression> -struct ExpressionHasTensorBroadcastingOp { - enum { value = false }; -}; - -template<typename LhsXprType, typename RhsXprType> -struct ExpressionHasTensorBroadcastingOp< - const TensorAssignOp<LhsXprType, RhsXprType> > { - enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value }; -}; - -template<typename UnaryOp, typename XprType> -struct ExpressionHasTensorBroadcastingOp< - const TensorCwiseUnaryOp<UnaryOp, XprType> > { - enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value }; -}; - -template<typename BinaryOp, typename LhsXprType, typename RhsXprType> -struct ExpressionHasTensorBroadcastingOp< - const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > { - enum { - value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value || - ExpressionHasTensorBroadcastingOp<RhsXprType>::value - }; -}; - -template<typename Broadcast, typename XprType> -struct ExpressionHasTensorBroadcastingOp< - const TensorBroadcastingOp<Broadcast, XprType> > { - enum { value = true }; -}; - -// -------------------------------------------------------------------------- // - -/** - * Default strategy: the expression is evaluated sequentially with a single cpu - * thread, without vectorization and block evaluation. - */ -template <typename Expression, typename Device, bool Vectorizable, - TiledEvaluation Tiling> -class TensorExecutor { - public: - typedef typename Expression::Index StorageIndex; - - // Including `unsupported/Eigen/CXX11/Tensor` in different translation units - // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR - // violation. If this template is instantiated with a non-default device, it - // means that this header file was included without defining - // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`. - static_assert(std::is_same<Device, DefaultDevice>::value, - "Default executor instantiated with non-default device. 
" - "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or " - "EIGEN_USE_SYCL before including Eigen headers."); - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const Device& device = Device()) { - TensorEvaluator<Expression, Device> evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - const StorageIndex size = array_prod(evaluator.dimensions()); - for (StorageIndex i = 0; i < size; ++i) { - evaluator.evalScalar(i); - } - } - evaluator.cleanup(); - } -}; - -/** - * Default async execution strategy is not implemented. Currently it's only - * available for ThreadPoolDevice (see definition below). - */ -template <typename Expression, typename Device, typename DoneCallback, - bool Vectorizable, TiledEvaluation Tiling> -class TensorAsyncExecutor {}; - -/** - * Process all the data with a single cpu thread, using vectorized instructions. - */ -template <typename Expression> -class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true, - /*Tiling=*/TiledEvaluation::Off> { - public: - typedef typename Expression::Index StorageIndex; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run( - const Expression& expr, const DefaultDevice& device = DefaultDevice()) { - TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - const StorageIndex size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits<typename TensorEvaluator< - Expression, DefaultDevice>::PacketReturnType>::size; - - // Give compiler a strong possibility to unroll the loop. But don't insist - // on unrolling, because if the function is expensive compiler should not - // unroll the loop at the expense of inlining. 
- const StorageIndex UnrolledSize = - (size / (4 * PacketSize)) * 4 * PacketSize; - for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { - for (StorageIndex j = 0; j < 4; j++) { - evaluator.evalPacket(i + j * PacketSize); - } - } - const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; - for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { - evaluator.evalPacket(i); - } - for (StorageIndex i = VectorizedSize; i < size; ++i) { - evaluator.evalScalar(i); - } - } - evaluator.cleanup(); - } -}; - -/** - * Process all the data with a single cpu thread, using blocks of data. By - * sizing a block to fit L1 cache we get better cache performance. - */ -template <typename Expression, bool Vectorizable> -class TensorExecutor<Expression, DefaultDevice, Vectorizable, - /*Tiling=*/TiledEvaluation::On> { - public: - typedef typename traits<Expression>::Scalar Scalar; - typedef typename remove_const<Scalar>::type ScalarNoConst; - - typedef TensorEvaluator<Expression, DefaultDevice> Evaluator; - typedef typename traits<Expression>::Index StorageIndex; - - static const int NumDims = traits<Expression>::NumDimensions; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const DefaultDevice& device = DefaultDevice()) { - typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex> - TensorBlockMapper; - - typedef internal::TensorBlockDescriptor<NumDims, StorageIndex> - TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<DefaultDevice> - TensorBlockScratch; - - Evaluator evaluator(expr, device); - - // TODO(ezhulenev): Do not use tiling for small tensors? - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - - if (needs_assign) { - // Query expression tree for desired block size/shape. 
- const TensorBlockResourceRequirements requirements = - evaluator.getResourceRequirements(); - - const TensorBlockMapper block_mapper( - typename TensorBlockDesc::Dimensions(evaluator.dimensions()), - requirements); - - // Share scratch memory allocator between all blocks. - TensorBlockScratch scratch(device); - - const StorageIndex total_block_count = block_mapper.blockCount(); - for (StorageIndex i = 0; i < total_block_count; ++i) { - TensorBlockDesc desc = block_mapper.blockDescriptor(i); - evaluator.evalBlock(desc, scratch); - scratch.reset(); - } - } - evaluator.cleanup(); - } -}; - -/** - * Multicore strategy: the index space is partitioned and each partition is - * executed on a single core. - * - * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread - * pool, and will block the caller thread until all tasks are finished. - * - * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to - * the ThreadPoolDevice managed thread pool, and will return immediately. - * It will call 'done' callback after all tasks are finished. - */ -#ifdef EIGEN_USE_THREADS - -template <typename TensorBlockMapper> -struct TensorExecutorTilingContext { - TensorExecutorTilingContext() = default; - TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, - const TensorOpCost& b_cost, size_t b_aligned_size) - : block_mapper(b_mapper), - cost(b_cost), - aligned_blocksize(b_aligned_size) {} - - TensorBlockMapper block_mapper; // navigate through blocks - TensorOpCost cost; // cost of computing a single block - size_t aligned_blocksize; // block size after memory alignment -}; - -// Computes a block evaluation parameters, and allocates temporary memory buffer -// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. 
-template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable> -TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext( - const Evaluator& evaluator) { - // Query expression tree for desired block size/shape. - TensorBlockResourceRequirements requirements = - evaluator.getResourceRequirements(); - - // Update target block size based on cost model. - double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize( - 1, requirements.cost_per_coeff); - requirements.size = static_cast<size_t>(1.0 / taskSize); - - TensorBlockMapper block_mapper( - typename TensorBlockMapper::Dimensions(evaluator.dimensions()), - requirements); - - size_t block_size = block_mapper.blockTotalSize(); - const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - const size_t aligned_blocksize = - align * - divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align); - - return {block_mapper, requirements.cost_per_coeff * block_size, - aligned_blocksize}; -} - -template <typename Evaluator, typename StorageIndex, bool Vectorizable> -struct EvalRange { - static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, - const StorageIndex lastIdx) { - Evaluator evaluator = *evaluator_in; - eigen_assert(lastIdx >= firstIdx); - for (StorageIndex i = firstIdx; i < lastIdx; ++i) { - evaluator.evalScalar(i); - } - } - - static StorageIndex alignBlockSize(StorageIndex size) { return size; } -}; - -template <typename Evaluator, typename StorageIndex> -struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> { - static const int PacketSize = - unpacket_traits<typename Evaluator::PacketReturnType>::size; - - static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, - const StorageIndex lastIdx) { - Evaluator evaluator = *evaluator_in; - eigen_assert(lastIdx >= firstIdx); - StorageIndex i = firstIdx; - if (lastIdx - firstIdx >= PacketSize) { - eigen_assert(firstIdx % PacketSize == 0); - StorageIndex last_chunk_offset = lastIdx - 
4 * PacketSize; - // Give compiler a strong possibility to unroll the loop. But don't insist - // on unrolling, because if the function is expensive compiler should not - // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4 * PacketSize) { - for (StorageIndex j = 0; j < 4; j++) { - evaluator.evalPacket(i + j * PacketSize); - } - } - last_chunk_offset = lastIdx - PacketSize; - for (; i <= last_chunk_offset; i += PacketSize) { - evaluator.evalPacket(i); - } - } - for (; i < lastIdx; ++i) { - evaluator.evalScalar(i); - } - } - - static StorageIndex alignBlockSize(StorageIndex size) { - // Align block size to packet size and account for unrolling in run above. - if (size >= 16 * PacketSize) { - return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); - } - // Aligning to 4 * PacketSize would increase block size by more than 25%. - return (size + PacketSize - 1) & ~(PacketSize - 1); - } -}; - -template <typename Expression, bool Vectorizable, TiledEvaluation Tiling> -class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> { - public: - typedef typename Expression::Index StorageIndex; - - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const ThreadPoolDevice& device) { - typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; - typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange; - - Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); - if (needs_assign) { - const StorageIndex size = array_prod(evaluator.dimensions()); - device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { - EvalRange::run(&evaluator, firstIdx, lastIdx); - }); - } - evaluator.cleanup(); - } -}; - -template <typename Expression, bool Vectorizable> -class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, - /*Tiling=*/TiledEvaluation::On> { - public: - 
typedef typename traits<Expression>::Index IndexType; - typedef typename traits<Expression>::Scalar Scalar; - typedef typename remove_const<Scalar>::type ScalarNoConst; - - static const int NumDims = traits<Expression>::NumDimensions; - - typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; - typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper; - typedef TensorExecutorTilingContext<BlockMapper> TilingContext; - - typedef internal::TensorBlockDescriptor<NumDims, IndexType> - TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> - TensorBlockScratch; - - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const ThreadPoolDevice& device) { - Evaluator evaluator(expr, device); - - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); - if (needs_assign) { - const TilingContext tiling = - internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, - Vectorizable>(evaluator); - - auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, - IndexType lastBlockIdx) { - TensorBlockScratch scratch(device); - - for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; - ++block_idx) { - TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx); - evaluator.evalBlock(desc, scratch); - scratch.reset(); - } - }; - - // Evaluate small expressions directly as a single block. 
- if (tiling.block_mapper.blockCount() == 1) { - TensorBlockScratch scratch(device); - TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions()); - evaluator.evalBlock(desc, scratch); - } else { - device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, - eval_block); - } - } - evaluator.cleanup(); - } -}; - -template <typename Expression, typename DoneCallback, bool Vectorizable, - TiledEvaluation Tiling> -class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, - Vectorizable, Tiling> { - public: - typedef typename Expression::Index StorageIndex; - typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; - - static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, - const ThreadPoolDevice& device, - DoneCallback done) { - TensorAsyncExecutorContext* const ctx = - new TensorAsyncExecutorContext(expr, device, std::move(done)); - - const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { - if (!need_assign) { - delete ctx; - return; - } - - typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange; - const StorageIndex size = array_prod(ctx->evaluator.dimensions()); - device.parallelForAsync( - size, ctx->evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { - EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); - }, - [ctx]() { delete ctx; }); - }; - - ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); - } - - private: - struct TensorAsyncExecutorContext { - TensorAsyncExecutorContext(const Expression& expr, - const ThreadPoolDevice& thread_pool, - DoneCallback done) - : evaluator(expr, thread_pool), on_done(std::move(done)) {} - - ~TensorAsyncExecutorContext() { - evaluator.cleanup(); - on_done(); - } - - Evaluator evaluator; - - private: - DoneCallback on_done; - }; -}; - -template <typename Expression, typename DoneCallback, bool Vectorizable> -class TensorAsyncExecutor<Expression, ThreadPoolDevice, 
DoneCallback, - Vectorizable, /*Tileable*/ TiledEvaluation::On> { - public: - typedef typename traits<Expression>::Index IndexType; - typedef typename traits<Expression>::Scalar Scalar; - typedef typename remove_const<Scalar>::type ScalarNoConst; - - static const int NumDims = traits<Expression>::NumDimensions; - - typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; - typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper; - typedef TensorExecutorTilingContext<BlockMapper> TilingContext; - - typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> - TensorBlockScratch; - - static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, - const ThreadPoolDevice& device, - DoneCallback done) { - - TensorAsyncExecutorContext* const ctx = - new TensorAsyncExecutorContext(expr, device, std::move(done)); - - const auto on_eval_subexprs = [ctx](bool need_assign) -> void { - if (!need_assign) { - delete ctx; - return; - } - - ctx->tiling = internal::GetTensorExecutorTilingContext< - Evaluator, BlockMapper, Vectorizable>(ctx->evaluator); - - auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { - TensorBlockScratch scratch(ctx->device); - - for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; - ++block_idx) { - TensorBlockDesc desc = - ctx->tiling.block_mapper.blockDescriptor(block_idx); - ctx->evaluator.evalBlock(desc, scratch); - scratch.reset(); - } - }; - - // Evaluate small expressions directly as a single block. 
- if (ctx->tiling.block_mapper.blockCount() == 1) { - TensorBlockScratch scratch(ctx->device); - TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions()); - ctx->evaluator.evalBlock(desc, scratch); - delete ctx; - } else { - ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), - ctx->tiling.cost, eval_block, - [ctx]() { delete ctx; }); - } - }; - - ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); - } - - private: - struct TensorAsyncExecutorContext { - TensorAsyncExecutorContext(const Expression& expr, - const ThreadPoolDevice& thread_pool, - DoneCallback done) - : device(thread_pool), - evaluator(expr, thread_pool), - on_done(std::move(done)) {} - - ~TensorAsyncExecutorContext() { - evaluator.cleanup(); - on_done(); - } - - const ThreadPoolDevice& device; - Evaluator evaluator; - TilingContext tiling; - - private: - DoneCallback on_done; - }; -}; - -#endif // EIGEN_USE_THREADS - -// GPU: the evaluation of the expression is offloaded to a GPU. 
-#if defined(EIGEN_USE_GPU) - -template <typename Expression, bool Vectorizable, TiledEvaluation Tiling> -class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> { - public: - typedef typename Expression::Index StorageIndex; - static void run(const Expression& expr, const GpuDevice& device); -}; - -#if defined(EIGEN_GPUCC) -template <typename Evaluator, typename StorageIndex, bool Vectorizable> -struct EigenMetaKernelEval { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { - for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) { - eval.evalScalar(i); - } - } -}; - -template <typename Evaluator, typename StorageIndex> -struct EigenMetaKernelEval<Evaluator, StorageIndex, true> { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { - const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; - const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize; - const StorageIndex vectorized_step_size = step_size * PacketSize; - - // Use the vector path - for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) { - eval.evalScalar(i); - } - } -}; - -template <typename Evaluator, typename StorageIndex> -__global__ void -__launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, StorageIndex size) { - - const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; - const StorageIndex step_size = blockDim.x * gridDim.x; - - const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size); -} - -/*static*/ -template <typename Expression, bool Vectorizable, 
TiledEvaluation Tiling> -EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run( - const Expression& expr, const GpuDevice& device) { - TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); - if (needs_assign) { - - const int block_size = device.maxGpuThreadsPerBlock(); - const int max_blocks = device.getNumGpuMultiProcessors() * - device.maxGpuThreadsPerMultiProcessor() / block_size; - const StorageIndex size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. - const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); - - LAUNCH_GPU_KERNEL( - (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>), - num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); -} - -#endif // EIGEN_GPUCC -#endif // EIGEN_USE_GPU - -// SYCL Executor policy -#ifdef EIGEN_USE_SYCL - -template <typename Evaluator> -struct ExecExprFunctorKernel { - typedef typename Evaluator::Index Index; - Evaluator evaluator; - const Index range; - template <typename Scratch> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel( - const Scratch, Evaluator evaluator_, const Index range_) - : evaluator(evaluator_), range(range_) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( - cl::sycl::nd_item<1> itemID) { - compute(itemID); - } - template <bool is_vec = Evaluator::PacketAccess> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type - compute(const cl::sycl::nd_item<1>& itemID) { - Index gId = static_cast<Index>(itemID.get_global_linear_id()); - Index total_threads = itemID.get_global_range(0); - - for (Index i = gId; i < range; i += total_threads) { - evaluator.evalScalar(i); - } - } - template <bool is_vec = Evaluator::PacketAccess> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
typename std::enable_if<is_vec>::type - compute(const cl::sycl::nd_item<1>& itemID) { - const Index vectorizedRange = - (range / Evaluator::PacketSize) * Evaluator::PacketSize; - Index gId = static_cast<Index>(itemID.get_global_linear_id()); - const Index step = Evaluator::PacketSize * itemID.get_global_range(0); - const Index start = Evaluator::PacketSize * gId; - for (Index i = start; i < vectorizedRange; i += step) { - evaluator.evalPacket(i); - } - gId += vectorizedRange; - for (Index i = gId; i < range; i += itemID.get_global_range(0)) { - evaluator.evalScalar(i); - } - } -}; - -template <typename Expression, bool Vectorizable, TiledEvaluation Tiling> -class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> { - public: - typedef typename Expression::Index Index; - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const Eigen::SyclDevice& dev) { - typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator; - Evaluator evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - Index range, GRange, tileSize; - Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); - total_size = (total_size == 0) ? 
1 : total_size; - const int PacketSize = - Eigen::PacketType<typename Evaluator::CoeffReturnType, - Eigen::SyclDevice>::size; - Index vectorizable_threads = static_cast<Index>(total_size / PacketSize); - dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); - range = total_size; - - dev.template nullary_kernel_launcher< - typename Evaluator::CoeffReturnType, - ExecExprFunctorKernel<Evaluator> >( - evaluator, - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - Index(1), range); - } - evaluator.cleanup(); - } -}; - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h deleted file mode 100644 index c9bccfc..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h +++ /dev/null @@ -1,388 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H - -namespace Eigen { - -/** \class TensorExpr - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor expression classes. - * - * The TensorCwiseNullaryOp class applies a nullary operators to an expression. - * This is typically used to generate constants. - * - * The TensorCwiseUnaryOp class represents an expression where a unary operator - * (e.g. cwiseSqrt) is applied to an expression. - * - * The TensorCwiseBinaryOp class represents an expression where a binary - * operator (e.g. addition) is applied to a lhs and a rhs expression. 
- * - */ -namespace internal { -template<typename NullaryOp, typename XprType> -struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> > - : traits<XprType> -{ - typedef traits<XprType> XprTraits; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::Nested XprTypeNested; - typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; - enum { - Flags = 0 - }; -}; - -} // end namespace internal - - - -template<typename NullaryOp, typename XprType> -class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested; - typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) - : m_xpr(xpr), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - nestedExpression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - const NullaryOp& functor() const { return m_functor; } - - protected: - typename XprType::Nested m_xpr; - const NullaryOp m_functor; -}; - - - -namespace internal { -template<typename UnaryOp, typename XprType> -struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> > - : traits<XprType> -{ - // TODO(phli): Add InputScalar, InputPacket. Check references to - // current Scalar/Packet to see if the intent is Input or Output. 
- typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprType::Nested XprTypeNested; - typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename TypeConversion<Scalar, - typename XprTraits::PointerType - >::type - PointerType; -}; - -template<typename UnaryOp, typename XprType> -struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense> -{ - typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type; -}; - -template<typename UnaryOp, typename XprType> -struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type> -{ - typedef TensorCwiseUnaryOp<UnaryOp, XprType> type; -}; - -} // end namespace internal - - - -template<typename UnaryOp, typename XprType> -class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors> -{ - public: - // TODO(phli): Add InputScalar, InputPacket. Check references to - // current Scalar/Packet to see if the intent is Input or Output. 
- typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested; - typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) - : m_xpr(xpr), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const UnaryOp& functor() const { return m_functor; } - - /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - nestedExpression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const UnaryOp m_functor; -}; - - -namespace internal { -template<typename BinaryOp, typename LhsXprType, typename RhsXprType> -struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > -{ - // Type promotion to handle the case where the types of the lhs and the rhs - // are different. - // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to - // current Scalar/Packet to see if the intent is Inputs or Output. 
- typedef typename result_of< - BinaryOp(typename LhsXprType::Scalar, - typename RhsXprType::Scalar)>::type Scalar; - typedef traits<LhsXprType> XprTraits; - typedef typename promote_storage_type< - typename traits<LhsXprType>::StorageKind, - typename traits<RhsXprType>::StorageKind>::ret StorageKind; - typedef typename promote_index_type< - typename traits<LhsXprType>::Index, - typename traits<RhsXprType>::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference<LhsNested>::type _LhsNested; - typedef typename remove_reference<RhsNested>::type _RhsNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename TypeConversion<Scalar, - typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, - typename traits<LhsXprType>::PointerType, - typename traits<RhsXprType>::PointerType>::type - >::type - PointerType; - enum { - Flags = 0 - }; -}; - -template<typename BinaryOp, typename LhsXprType, typename RhsXprType> -struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense> -{ - typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type; -}; - -template<typename BinaryOp, typename LhsXprType, typename RhsXprType> -struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type> -{ - typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type; -}; - -} // end namespace internal - - - -template<typename BinaryOp, typename LhsXprType, typename RhsXprType> -class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors> -{ - public: - // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to - // current Scalar/Packet to see if the intent is Inputs or Output. 
- typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested; - typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const BinaryOp& functor() const { return m_functor; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename LhsXprType::Nested>::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename RhsXprType::Nested>::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const BinaryOp m_functor; -}; - - -namespace internal { -template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> -struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> > -{ - // Type promotion to handle the case where the types of the args are different. 
- typedef typename result_of< - TernaryOp(typename Arg1XprType::Scalar, - typename Arg2XprType::Scalar, - typename Arg3XprType::Scalar)>::type Scalar; - typedef traits<Arg1XprType> XprTraits; - typedef typename traits<Arg1XprType>::StorageKind StorageKind; - typedef typename traits<Arg1XprType>::Index Index; - typedef typename Arg1XprType::Nested Arg1Nested; - typedef typename Arg2XprType::Nested Arg2Nested; - typedef typename Arg3XprType::Nested Arg3Nested; - typedef typename remove_reference<Arg1Nested>::type _Arg1Nested; - typedef typename remove_reference<Arg2Nested>::type _Arg2Nested; - typedef typename remove_reference<Arg3Nested>::type _Arg3Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename TypeConversion<Scalar, - typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val, - typename traits<Arg2XprType>::PointerType, - typename traits<Arg3XprType>::PointerType>::type - >::type - PointerType; - enum { - Flags = 0 - }; -}; - -template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> -struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense> -{ - typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type; -}; - -template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> -struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type> -{ - typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type; -}; - -} // end namespace internal - - - -template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> -class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 
ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested; - typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp()) - : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const TernaryOp& functor() const { return m_functor; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename Arg1XprType::Nested>::type& - arg1Expression() const { return m_arg1_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename Arg2XprType::Nested>::type& - arg2Expression() const { return m_arg2_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename Arg3XprType::Nested>::type& - arg3Expression() const { return m_arg3_xpr; } - - protected: - typename Arg1XprType::Nested m_arg1_xpr; - typename Arg2XprType::Nested m_arg2_xpr; - typename Arg3XprType::Nested m_arg3_xpr; - const TernaryOp m_functor; -}; - - -namespace internal { -template<typename IfXprType, typename ThenXprType, typename ElseXprType> -struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > - : traits<ThenXprType> -{ - typedef typename traits<ThenXprType>::Scalar Scalar; - typedef traits<ThenXprType> XprTraits; - typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind, - typename traits<ElseXprType>::StorageKind>::ret StorageKind; - typedef typename promote_index_type<typename traits<ElseXprType>::Index, - typename traits<ThenXprType>::Index>::type 
Index; - typedef typename IfXprType::Nested IfNested; - typedef typename ThenXprType::Nested ThenNested; - typedef typename ElseXprType::Nested ElseNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val, - typename traits<ThenXprType>::PointerType, - typename traits<ElseXprType>::PointerType>::type PointerType; -}; - -template<typename IfXprType, typename ThenXprType, typename ElseXprType> -struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense> -{ - typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type; -}; - -template<typename IfXprType, typename ThenXprType, typename ElseXprType> -struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type> -{ - typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type; -}; - -} // end namespace internal - - -template<typename IfXprType, typename ThenXprType, typename ElseXprType> -class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType, - typename ElseXprType::CoeffReturnType>::ret CoeffReturnType; - typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested; - typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index; - - EIGEN_DEVICE_FUNC - TensorSelectOp(const IfXprType& a_condition, - const ThenXprType& a_then, - const ElseXprType& a_else) - : m_condition(a_condition), m_then(a_then), m_else(a_else) - { } - - EIGEN_DEVICE_FUNC - const 
IfXprType& ifExpression() const { return m_condition; } - - EIGEN_DEVICE_FUNC - const ThenXprType& thenExpression() const { return m_then; } - - EIGEN_DEVICE_FUNC - const ElseXprType& elseExpression() const { return m_else; } - - protected: - typename IfXprType::Nested m_condition; - typename ThenXprType::Nested m_then; - typename ElseXprType::Nested m_else; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h deleted file mode 100644 index 4a1a068..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h +++ /dev/null @@ -1,669 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H -#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H - -namespace Eigen { - -/** \class TensorFFT - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor FFT class. 
- * - * TODO: - * Vectorize the Cooley Tukey and the Bluestein algorithm - * Add support for multithreaded evaluation - * Improve the performance on GPU - */ - -template <bool NeedUprade> struct MakeComplex { - template <typename T> - EIGEN_DEVICE_FUNC - T operator() (const T& val) const { return val; } -}; - -template <> struct MakeComplex<true> { - template <typename T> - EIGEN_DEVICE_FUNC - std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); } -}; - -template <> struct MakeComplex<false> { - template <typename T> - EIGEN_DEVICE_FUNC - std::complex<T> operator() (const std::complex<T>& val) const { return val; } -}; - -template <int ResultType> struct PartOf { - template <typename T> T operator() (const T& val) const { return val; } -}; - -template <> struct PartOf<RealPart> { - template <typename T> T operator() (const std::complex<T>& val) const { return val.real(); } -}; - -template <> struct PartOf<ImagPart> { - template <typename T> T operator() (const std::complex<T>& val) const { return val.imag(); } -}; - -namespace internal { -template <typename FFT, typename XprType, int FFTResultType, int FFTDir> -struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> { - typedef traits<XprType> XprTraits; - typedef typename NumTraits<typename XprTraits::Scalar>::Real RealScalar; - typedef typename std::complex<RealScalar> ComplexScalar; - typedef typename XprTraits::Scalar InputScalar; - typedef typename conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename traits<XprType>::PointerType PointerType; -}; - -template <typename 
FFT, typename XprType, int FFTResultType, int FFTDirection> -struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> { - typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type; -}; - -template <typename FFT, typename XprType, int FFTResultType, int FFTDirection> -struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1, typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> { - typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type; -}; - -} // end namespace internal - -template <typename FFT, typename XprType, int FFTResultType, int FFTDir> -class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> { - public: - typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename std::complex<RealScalar> ComplexScalar; - typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; - typedef OutputScalar CoeffReturnType; - typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested; - typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) - : m_xpr(expr), m_fft(fft) {} - - EIGEN_DEVICE_FUNC - const FFT& fft() const { return m_fft; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& expression() const { - return m_xpr; - } - - protected: - typename XprType::Nested m_xpr; - const FFT m_fft; -}; - -// Eval as rvalue -template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir> -struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> { - typedef TensorFFTOp<FFT, ArgType, 
FFTResultType, FFTDir> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename std::complex<RealScalar> ComplexScalar; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; - typedef internal::traits<XprType> XprTraits; - typedef typename XprTraits::Scalar InputScalar; - typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; - typedef OutputScalar CoeffReturnType; - typedef typename PacketType<OutputScalar, Device>::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = true, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i]; - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_strides[i] = m_strides[i - 1] * 
m_dimensions[i - 1]; - } - } else { - m_strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; - } - } - m_size = m_dimensions.TotalSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_dimensions; - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - m_impl.evalSubExprsIfNeeded(NULL); - if (data) { - evalToBuf(data); - return false; - } else { - m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); - evalToBuf(m_data); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - if (m_data) { - m_device.deallocate(m_data); - m_data = NULL; - } - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { - return m_data[index]; - } - - template <int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType - packet(Index index) const { - return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } -#endif - - private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) { - const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value; - ComplexScalar* buf = write_to_out ? 
(ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); - - for (Index i = 0; i < m_size; ++i) { - buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i)); - } - - for (size_t i = 0; i < m_fft.size(); ++i) { - Index dim = m_fft[i]; - eigen_assert(dim >= 0 && dim < NumDims); - Index line_len = m_dimensions[dim]; - eigen_assert(line_len >= 1); - ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); - const bool is_power_of_two = isPowerOfTwo(line_len); - const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); - const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); - - ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); - ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); - ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); - if (!is_power_of_two) { - // Compute twiddle factors - // t_n = exp(sqrt(-1) * pi * n^2 / line_len) - // for n = 0, 1,..., line_len-1. - // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - - // The recurrence is correct in exact arithmetic, but causes - // numerical issues for large transforms, especially in - // single-precision floating point. 
- // - // pos_j_base_powered[0] = ComplexScalar(1, 0); - // if (line_len > 1) { - // const ComplexScalar pos_j_base = ComplexScalar( - // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); - // pos_j_base_powered[1] = pos_j_base; - // if (line_len > 2) { - // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - // for (int i = 2; i < line_len + 1; ++i) { - // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * - // pos_j_base_powered[i - 1] / - // pos_j_base_powered[i - 2] * - // pos_j_base_sq; - // } - // } - // } - // TODO(rmlarsen): Find a way to use Eigen's vectorized sin - // and cosine functions here. - for (int j = 0; j < line_len + 1; ++j) { - double arg = ((EIGEN_PI * j) * j) / line_len; - std::complex<double> tmp(numext::cos(arg), numext::sin(arg)); - pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp); - } - } - - for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { - const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); - - // get data into line_buf - const Index stride = m_strides[dim]; - if (stride == 1) { - m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); - } else { - Index offset = base_offset; - for (int j = 0; j < line_len; ++j, offset += stride) { - line_buf[j] = buf[offset]; - } - } - - // process the line - if (is_power_of_two) { - processDataLineCooleyTukey(line_buf, line_len, log_len); - } - else { - processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); - } - - // write back - if (FFTDir == FFT_FORWARD && stride == 1) { - m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); - } else { - Index offset = base_offset; - const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); - for (int j = 0; j < line_len; ++j, offset += stride) { - buf[offset] = (FFTDir == FFT_FORWARD) ? 
line_buf[j] : line_buf[j] * div_factor; - } - } - } - m_device.deallocate(line_buf); - if (!is_power_of_two) { - m_device.deallocate(a); - m_device.deallocate(b); - m_device.deallocate(pos_j_base_powered); - } - } - - if(!write_to_out) { - for (Index i = 0; i < m_size; ++i) { - data[i] = PartOf<FFTResultType>()(buf[i]); - } - m_device.deallocate(buf); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { - eigen_assert(x > 0); - return !(x & (x - 1)); - } - - // The composite number for padding, used in Bluestein's FFT algorithm - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { - Index i = 2; - while (i < 2 * n - 1) i *= 2; - return i; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { - Index log2m = 0; - while (m >>= 1) log2m++; - return log2m; - } - - // Call Cooley Tukey algorithm directly, data length must be power of 2 - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) { - eigen_assert(isPowerOfTwo(line_len)); - scramble_FFT(line_buf, line_len); - compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len); - } - - // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { - Index n = line_len; - Index m = good_composite; - ComplexScalar* data = line_buf; - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { - a[i] = data[i] * numext::conj(pos_j_base_powered[i]); - } - else { - a[i] = data[i] * pos_j_base_powered[i]; - } - } - for (Index i = n; i < m; ++i) { - a[i] = ComplexScalar(0, 0); - } - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { - b[i] = pos_j_base_powered[i]; - } - else { - 
b[i] = numext::conj(pos_j_base_powered[i]); - } - } - for (Index i = n; i < m - n; ++i) { - b[i] = ComplexScalar(0, 0); - } - for (Index i = m - n; i < m; ++i) { - if(FFTDir == FFT_FORWARD) { - b[i] = pos_j_base_powered[m-i]; - } - else { - b[i] = numext::conj(pos_j_base_powered[m-i]); - } - } - - scramble_FFT(a, m); - compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len); - - scramble_FFT(b, m); - compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len); - - for (Index i = 0; i < m; ++i) { - a[i] *= b[i]; - } - - scramble_FFT(a, m); - compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len); - - //Do the scaling after ifft - for (Index i = 0; i < m; ++i) { - a[i] /= m; - } - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { - data[i] = a[i] * numext::conj(pos_j_base_powered[i]); - } - else { - data[i] = a[i] * pos_j_base_powered[i]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { - eigen_assert(isPowerOfTwo(n)); - Index j = 1; - for (Index i = 1; i < n; ++i){ - if (j > i) { - std::swap(data[j-1], data[i-1]); - } - Index m = n >> 1; - while (m >= 2 && j > m) { - j -= m; - m >>= 1; - } - j += m; - } - } - - template <int Dir> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { - ComplexScalar tmp = data[1]; - data[1] = data[0] - data[1]; - data[0] += tmp; - } - - template <int Dir> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { - ComplexScalar tmp[4]; - tmp[0] = data[0] + data[1]; - tmp[1] = data[0] - data[1]; - tmp[2] = data[2] + data[3]; - if (Dir == FFT_FORWARD) { - tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); - } else { - tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); - } - data[0] = tmp[0] + tmp[2]; - data[1] = tmp[1] + tmp[3]; - data[2] = tmp[0] - tmp[2]; - data[3] = tmp[1] - tmp[3]; - } - - template <int Dir> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { - ComplexScalar tmp_1[8]; - 
ComplexScalar tmp_2[8]; - - tmp_1[0] = data[0] + data[1]; - tmp_1[1] = data[0] - data[1]; - tmp_1[2] = data[2] + data[3]; - if (Dir == FFT_FORWARD) { - tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); - } else { - tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); - } - tmp_1[4] = data[4] + data[5]; - tmp_1[5] = data[4] - data[5]; - tmp_1[6] = data[6] + data[7]; - if (Dir == FFT_FORWARD) { - tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); - } else { - tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); - } - tmp_2[0] = tmp_1[0] + tmp_1[2]; - tmp_2[1] = tmp_1[1] + tmp_1[3]; - tmp_2[2] = tmp_1[0] - tmp_1[2]; - tmp_2[3] = tmp_1[1] - tmp_1[3]; - tmp_2[4] = tmp_1[4] + tmp_1[6]; -// SQRT2DIV2 = sqrt(2)/2 -#define SQRT2DIV2 0.7071067811865476 - if (Dir == FFT_FORWARD) { - tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); - tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); - tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); - } else { - tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); - tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); - tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); - } - data[0] = tmp_2[0] + tmp_2[4]; - data[1] = tmp_2[1] + tmp_2[5]; - data[2] = tmp_2[2] + tmp_2[6]; - data[3] = tmp_2[3] + tmp_2[7]; - data[4] = tmp_2[0] - tmp_2[4]; - data[5] = tmp_2[1] - tmp_2[5]; - data[6] = tmp_2[2] - tmp_2[6]; - data[7] = tmp_2[3] - tmp_2[7]; - } - - template <int Dir> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( - ComplexScalar* data, Index n, Index n_power_of_2) { - // Original code: - // RealScalar wtemp = std::sin(M_PI/n); - // RealScalar wpi = -std::sin(2 * M_PI/n); - const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; - const RealScalar wpi = (Dir == FFT_FORWARD) - ? 
m_minus_sin_2_PI_div_n_LUT[n_power_of_2] - : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; - - const ComplexScalar wp(wtemp, wpi); - const ComplexScalar wp_one = wp + ComplexScalar(1, 0); - const ComplexScalar wp_one_2 = wp_one * wp_one; - const ComplexScalar wp_one_3 = wp_one_2 * wp_one; - const ComplexScalar wp_one_4 = wp_one_3 * wp_one; - const Index n2 = n / 2; - ComplexScalar w(1.0, 0.0); - for (Index i = 0; i < n2; i += 4) { - ComplexScalar temp0(data[i + n2] * w); - ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); - ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); - ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); - w = w * wp_one_4; - - data[i + n2] = data[i] - temp0; - data[i] += temp0; - - data[i + 1 + n2] = data[i + 1] - temp1; - data[i + 1] += temp1; - - data[i + 2 + n2] = data[i + 2] - temp2; - data[i + 2] += temp2; - - data[i + 3 + n2] = data[i + 3] - temp3; - data[i + 3] += temp3; - } - } - - template <int Dir> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( - ComplexScalar* data, Index n, Index n_power_of_2) { - eigen_assert(isPowerOfTwo(n)); - if (n > 8) { - compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1); - compute_1D_Butterfly<Dir>(data + n / 2, n / 2, n_power_of_2 - 1); - butterfly_1D_merge<Dir>(data, n, n_power_of_2); - } else if (n == 8) { - butterfly_8<Dir>(data); - } else if (n == 4) { - butterfly_4<Dir>(data); - } else if (n == 2) { - butterfly_2<Dir>(data); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { - Index result = 0; - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > omitted_dim; --i) { - const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; - const Index idx = index / partial_m_stride; - index -= idx * partial_m_stride; - result += idx * m_strides[i]; - } - result += index; - } - else { - for (Index i = 0; i < omitted_dim; ++i) { - const Index partial_m_stride 
= m_strides[i] / m_dimensions[omitted_dim]; - const Index idx = index / partial_m_stride; - index -= idx * partial_m_stride; - result += idx * m_strides[i]; - } - result += index; - } - // Value of index_coords[omitted_dim] is not determined to this step - return result; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { - Index result = base + offset * m_strides[omitted_dim] ; - return result; - } - - protected: - Index m_size; - const FFT EIGEN_DEVICE_REF m_fft; - Dimensions m_dimensions; - array<Index, NumDims> m_strides; - TensorEvaluator<ArgType, Device> m_impl; - EvaluatorPointerType m_data; - const Device EIGEN_DEVICE_REF m_device; - - // This will support a maximum FFT size of 2^32 for each dimension - // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; - const RealScalar m_sin_PI_div_n_LUT[32] = { - RealScalar(0.0), - RealScalar(-2), - RealScalar(-0.999999999999999), - RealScalar(-0.292893218813453), - RealScalar(-0.0761204674887130), - RealScalar(-0.0192147195967696), - RealScalar(-0.00481527332780311), - RealScalar(-0.00120454379482761), - RealScalar(-3.01181303795779e-04), - RealScalar(-7.52981608554592e-05), - RealScalar(-1.88247173988574e-05), - RealScalar(-4.70619042382852e-06), - RealScalar(-1.17654829809007e-06), - RealScalar(-2.94137117780840e-07), - RealScalar(-7.35342821488550e-08), - RealScalar(-1.83835707061916e-08), - RealScalar(-4.59589268710903e-09), - RealScalar(-1.14897317243732e-09), - RealScalar(-2.87243293150586e-10), - RealScalar( -7.18108232902250e-11), - RealScalar(-1.79527058227174e-11), - RealScalar(-4.48817645568941e-12), - RealScalar(-1.12204411392298e-12), - RealScalar(-2.80511028480785e-13), - RealScalar(-7.01277571201985e-14), - RealScalar(-1.75319392800498e-14), - RealScalar(-4.38298482001247e-15), - RealScalar(-1.09574620500312e-15), - RealScalar(-2.73936551250781e-16), - RealScalar(-6.84841378126949e-17), - 
RealScalar(-1.71210344531737e-17), - RealScalar(-4.28025861329343e-18) - }; - - // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); - const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { - RealScalar(0.0), - RealScalar(0.0), - RealScalar(-1.00000000000000e+00), - RealScalar(-7.07106781186547e-01), - RealScalar(-3.82683432365090e-01), - RealScalar(-1.95090322016128e-01), - RealScalar(-9.80171403295606e-02), - RealScalar(-4.90676743274180e-02), - RealScalar(-2.45412285229123e-02), - RealScalar(-1.22715382857199e-02), - RealScalar(-6.13588464915448e-03), - RealScalar(-3.06795676296598e-03), - RealScalar(-1.53398018628477e-03), - RealScalar(-7.66990318742704e-04), - RealScalar(-3.83495187571396e-04), - RealScalar(-1.91747597310703e-04), - RealScalar(-9.58737990959773e-05), - RealScalar(-4.79368996030669e-05), - RealScalar(-2.39684498084182e-05), - RealScalar(-1.19842249050697e-05), - RealScalar(-5.99211245264243e-06), - RealScalar(-2.99605622633466e-06), - RealScalar(-1.49802811316901e-06), - RealScalar(-7.49014056584716e-07), - RealScalar(-3.74507028292384e-07), - RealScalar(-1.87253514146195e-07), - RealScalar(-9.36267570730981e-08), - RealScalar(-4.68133785365491e-08), - RealScalar(-2.34066892682746e-08), - RealScalar(-1.17033446341373e-08), - RealScalar(-5.85167231706864e-09), - RealScalar(-2.92583615853432e-09) - }; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h deleted file mode 100644 index ca39bb8..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h +++ /dev/null @@ -1,379 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H -#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H - -namespace Eigen { - -/** \class TensorFixedSize - * \ingroup CXX11_Tensor_Module - * - * \brief The fixed sized version of the tensor class. - * - * The fixed sized equivalent of - * Eigen::Tensor<float, 3> t(3, 5, 7); - * is - * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t; - */ - -template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType> -class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > -{ - public: - typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self; - typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base; - typedef typename Eigen::internal::nested<Self>::type Nested; - typedef typename internal::traits<Self>::StorageKind StorageKind; - typedef typename internal::traits<Self>::Index Index; - typedef Scalar_ Scalar; - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - - static const int Options = Options_; - - enum { - IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0), - PacketAccess = (internal::packet_traits<Scalar>::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, - CoordAccess = true, - RawAccess = true - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - typedef Dimensions_ Dimensions; - static const std::size_t NumIndices = Dimensions::count; - - protected: - TensorStorage<Scalar, Dimensions, Options> m_storage; - - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } - - // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - // work, because that uses base().coeffRef() - and we don't yet - // implement a similar class hierarchy - inline Self& base() { return *this; } - inline const Self& base() const { return *this; } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}}); - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}}); - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const - { - if (Options&RowMajor) { - const Index index = i1 + i0 * m_storage.dimensions()[1]; - return m_storage.data()[index]; - } else { - const Index index = i0 + i1 * m_storage.dimensions()[0]; - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const - { - if (Options&RowMajor) { - const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const - { - if (Options&RowMajor) { - const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - if (Options&RowMajor) { - const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 
+ m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); - return m_storage.data()[index]; - } - } -#endif - - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const - { - eigen_assert(checkIndexRange(indices)); - return coeff(indices); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return coeff(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const - { - // The bracket operator is only for vectors, use the parenthesis operator instead. - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(index); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) - { - if (Options&RowMajor) { - const Index index = i1 + i0 * m_storage.dimensions()[1]; - return m_storage.data()[index]; - } else { - const Index index = i0 + i1 * m_storage.dimensions()[0]; - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) - { - if (Options&RowMajor) { - const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - if (Options&RowMajor) { - const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - if (Options&RowMajor) { - const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); - return m_storage.data()[index]; - } - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& 
operator()(const array<Index, NumIndices>& indices) - { - eigen_assert(checkIndexRange(indices)); - return coeffRef(indices); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_assert(index >= 0 && index < size()); - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeffRef(); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator[](Index index) - { - // The bracket operator is only for vectors, use the parenthesis operator instead - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize() - : m_storage() - { - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) - : m_storage(other.m_storage) - { - } - -#if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) - : m_storage(other.m_storage) - { - } -#endif - - template<typename OtherDerived> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) - { - typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign; - Assign assign(*this, other.derived()); - internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); - } - template<typename OtherDerived> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, WriteAccessors>& other) - { - typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign; - Assign assign(*this, other.derived()); - internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); - } - - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. 
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize) - - - protected: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const - { - using internal::array_apply_and_reduce; - using internal::array_zip_and_reduce; - using internal::greater_equal_zero_op; - using internal::logical_and_op; - using internal::lesser_op; - - return true; - // check whether the indices are all >= 0 - /* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && - // check whether the indices fit in the dimensions - array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/ - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const - { - if (Options&RowMajor) { - return m_storage.dimensions().IndexOfRowMajor(indices); - } else { - return m_storage.dimensions().IndexOfColMajor(indices); - } - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h deleted file mode 100644 index e800ded..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h +++ /dev/null @@ -1,237 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H -#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H - -namespace Eigen { - -/** \class TensorForcedEval - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. 
- * - * - */ -namespace internal { -template<typename XprType> -struct traits<TensorForcedEvalOp<XprType> > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename traits<XprType>::StorageKind StorageKind; - typedef typename traits<XprType>::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; - - enum { - Flags = 0 - }; -}; - -template<typename XprType> -struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> -{ - typedef const TensorForcedEvalOp<XprType>& type; -}; - -template<typename XprType> -struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type> -{ - typedef TensorForcedEvalOp<XprType> type; -}; - -} // end namespace internal - - - -template<typename XprType> -class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested; - typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - -namespace internal { -template <typename 
Device, typename CoeffReturnType> -struct non_integral_type_placement_new{ - template <typename StorageType> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) { - // Initialize non-trivially constructible types. - if (!internal::is_arithmetic<CoeffReturnType>::value) { - for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType(); - } -} -}; - -// SYCL does not support non-integral types -// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices -// no matching function for call to 'operator new' -template <typename CoeffReturnType> -struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> { - template <typename StorageType> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) { -} -}; -} // end namespace internal - -template<typename ArgType_, typename Device> -struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device> -{ - typedef const typename internal::remove_all<ArgType_>::type ArgType; - typedef TensorForcedEvalOp<ArgType> XprType; - typedef typename ArgType::Scalar Scalar; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = true, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = internal::is_arithmetic<CoeffReturnType>::value, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - RawAccess = true - }; - - static const int NumDims = 
internal::traits<ArgType>::NumDimensions; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_op(op.expression()), - m_device(device), m_buffer(NULL) - { } - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); - - internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer); - - typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo; - EvalTo evalToTmp(m_device.get(m_buffer), m_op); - - internal::TensorExecutor< - const EvalTo, typename internal::remove_const<Device>::type, - /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value, - /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>:: - run(evalToTmp, m_device); - - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp( - numValues * sizeof(CoeffReturnType))); - typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type> - EvalTo; - EvalTo evalToTmp(m_device.get(m_buffer), m_op); - - auto 
on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, - std::move(done)); - internal::TensorAsyncExecutor< - const EvalTo, typename internal::remove_const<Device>::type, - decltype(on_done), - /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value, - /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>:: - runAsync(evalToTmp, m_device, std::move(on_done)); - } -#endif - - EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate_temp(m_buffer); - m_buffer = NULL; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_buffer[index]; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::any(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - assert(m_buffer != NULL); - return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - EvaluatorPointerType data() const { return m_buffer; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_buffer.bind(cgh); - m_impl.bind(cgh); - } -#endif - private: - TensorEvaluator<ArgType, Device> m_impl; - const ArgType m_op; - const Device EIGEN_DEVICE_REF m_device; - EvaluatorPointerType m_buffer; -}; - - -} // end namespace Eigen - -#endif // 
EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h deleted file mode 100644 index 246ebe4..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h +++ /dev/null @@ -1,191 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H - -namespace Eigen { - -// MakePointer class is used as a container of the address space of the pointer -// on the host and on the device. From the host side it generates the T* pointer -// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to -// T* m_data on the host. It is always called on the device. -// Specialisation of MakePointer class for creating the sycl buffer with -// map_allocator. -template<typename T> struct MakePointer { - typedef T* Type; - typedef const T* ConstType; -}; - -template <typename T> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) { - return const_cast<T*>(data); -} - -// The StorageMemory class is a container of the device specific pointer -// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression -// is a device-agnostic type and need MakePointer class for type conversion, -// the TensorEvaluator class can be specialized for a device, hence it is possible -// to construct different types of temproray storage memory in TensorEvaluator -// for different devices by specializing the following StorageMemory class. 
-template<typename T, typename device> struct StorageMemory: MakePointer <T> {}; - -namespace internal{ -template<typename A, typename B> struct Pointer_type_promotion { - static const bool val=false; -}; -template<typename A> struct Pointer_type_promotion<A, A> { - static const bool val = true; -}; -template<typename A, typename B> struct TypeConversion { - typedef A* type; -}; -} - - -template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap; -template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor; -template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize; -template<typename PlainObjectType> class TensorRef; -template<typename Derived, int AccessLevel> class TensorBase; - -template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp; -template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp; -template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp; -template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp; -template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp; -template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer > class TensorReductionOp; -template<typename XprType> class TensorIndexTupleOp; -template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp; -template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp; -template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp; -template<typename TargetType, typename XprType> class TensorConversionOp; -template<typename Dimensions, typename InputXprType, typename 
KernelXprType> class TensorConvolutionOp; -template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp; -template<typename PatchDim, typename XprType> class TensorPatchOp; -template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp; -template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp; -template<typename Broadcast, typename XprType> class TensorBroadcastingOp; -template<DenseIndex DimId, typename XprType> class TensorChippingOp; -template<typename NewDimensions, typename XprType> class TensorReshapingOp; -template<typename XprType> class TensorLayoutSwapOp; -template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp; -template<typename ReverseDimensions, typename XprType> class TensorReverseOp; -template<typename PaddingDimensions, typename XprType> class TensorPaddingOp; -template<typename Shuffle, typename XprType> class TensorShufflingOp; -template<typename Strides, typename XprType> class TensorStridingOp; -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp; -template<typename Strides, typename XprType> class TensorInflationOp; -template<typename Generator, typename XprType> class TensorGeneratorOp; -template<typename LeftXprType, typename RightXprType> class TensorAssignOp; -template<typename Op, typename XprType> class TensorScanOp; -template<typename Dims, typename XprType> class TensorTraceOp; - -template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; -template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; - -template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp; -template<typename XprType> class TensorForcedEvalOp; - -template<typename ExpressionType, typename DeviceType> class TensorDevice; -template<typename ExpressionType, 
typename DeviceType, typename DoneCallback> class TensorAsyncDevice; -template<typename Derived, typename Device> struct TensorEvaluator; - -struct NoOpOutputKernel; - -struct DefaultDevice; -struct ThreadPoolDevice; -struct GpuDevice; -struct SyclDevice; - -#ifdef EIGEN_USE_SYCL - -template <typename T> struct MakeSYCLPointer { - typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type; -}; - -template <typename T> -EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& -constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) { - return data; -} - -template <typename T> -struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {}; -template <typename T> -struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {}; - -namespace TensorSycl { -namespace internal{ -template <typename Evaluator, typename Op> class GenericNondeterministicReducer; -} -} -#endif - - -enum FFTResultType { - RealPart = 0, - ImagPart = 1, - BothParts = 2 -}; - -enum FFTDirection { - FFT_FORWARD = 0, - FFT_REVERSE = 1 -}; - - -namespace internal { - -template <typename Device, typename Expression> -struct IsVectorizable { - static const bool value = TensorEvaluator<Expression, Device>::PacketAccess; -}; - -template <typename Expression> -struct IsVectorizable<GpuDevice, Expression> { - static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && - TensorEvaluator<Expression, GpuDevice>::IsAligned; -}; - -// Tiled evaluation strategy. -enum TiledEvaluation { - Off = 0, // tiled evaluation is not supported - On = 1, // still work in progress (see TensorBlock.h) -}; - -template <typename Device, typename Expression> -struct IsTileable { - // Check that block evaluation is supported and it's a preferred option (at - // least one sub-expression has much faster block evaluation, e.g. - // broadcasting). 
- static const bool BlockAccess = - TensorEvaluator<Expression, Device>::BlockAccess && - TensorEvaluator<Expression, Device>::PreferBlockAccess; - - static const TiledEvaluation value = - BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off; -}; - -template <typename Expression, typename Device, - bool Vectorizable = IsVectorizable<Device, Expression>::value, - TiledEvaluation Tiling = IsTileable<Device, Expression>::value> -class TensorExecutor; - -template <typename Expression, typename Device, typename DoneCallback, - bool Vectorizable = IsVectorizable<Device, Expression>::value, - TiledEvaluation Tiling = IsTileable<Device, Expression>::value> -class TensorAsyncExecutor; - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h deleted file mode 100644 index d963032..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h +++ /dev/null @@ -1,488 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H -#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H - -namespace Eigen { -namespace internal { - - -/** \internal - * \brief Template functor to compute the modulo between an array and a scalar. 
- */ -template <typename Scalar> -struct scalar_mod_op { - EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; } - const Scalar m_divisor; -}; -template <typename Scalar> -struct functor_traits<scalar_mod_op<Scalar> > -{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; }; - - -/** \internal - * \brief Template functor to compute the modulo between 2 arrays. - */ -template <typename Scalar> -struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } -}; -template <typename Scalar> -struct functor_traits<scalar_mod2_op<Scalar> > -{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; }; - -template <typename Scalar> -struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar - operator()(const Scalar& a, const Scalar& b) const { - return numext::fmod(a, b); - } -}; -template <typename Scalar> -struct functor_traits<scalar_fmod_op<Scalar> > { - enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell. 
- PacketAccess = false }; -}; - -template<typename Reducer, typename Device> -struct reducer_traits { - enum { - Cost = 1, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true - }; -}; - -// Standard reduction functors -template <typename T> struct SumReducer -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - internal::scalar_sum_op<T> sum_op; - *accum = sum_op(*accum, t); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = padd<Packet>(*accum, p); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op<int, T> conv; - return conv(0); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1<Packet>(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_sum_op<T> sum_op; - return sum_op(saccum, predux(vaccum)); - } -}; - -template <typename T, typename Device> -struct reducer_traits<SumReducer<T>, Device> { - enum { - Cost = NumTraits<T>::AddCost, - PacketAccess = PacketType<T, Device>::HasAdd, - IsStateful = false, - IsExactlyAssociative = NumTraits<T>::IsInteger - }; -}; - -template <typename T> struct MeanReducer -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - MeanReducer() : scalarCount_(0), packetCount_(0) { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { - internal::scalar_sum_op<T> sum_op; - *accum = sum_op(*accum, t); - scalarCount_++; - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& 
p, Packet* accum) { - (*accum) = padd<Packet>(*accum, p); - packetCount_++; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op<int, T> conv; - return conv(0); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1<Packet>(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - internal::scalar_quotient_op<T> quotient_op; - return quotient_op(accum, T(scalarCount_)); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return pdiv(vaccum, pset1<Packet>(T(packetCount_))); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_sum_op<T> sum_op; - internal::scalar_quotient_op<T> quotient_op; - return quotient_op( - sum_op(saccum, predux(vaccum)), - T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size)); - } - - protected: - DenseIndex scalarCount_; - DenseIndex packetCount_; -}; - -template <typename T, typename Device> -struct reducer_traits<MeanReducer<T>, Device> { - enum { - Cost = NumTraits<T>::AddCost, - PacketAccess = PacketType<T, Device>::HasAdd && - PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger, - IsStateful = true, - IsExactlyAssociative = NumTraits<T>::IsInteger - }; -}; - - -template <typename T, bool IsMax = true, bool IsInteger = true> -struct MinMaxBottomValue { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return Eigen::NumTraits<T>::lowest(); - } -}; -template <typename T> -struct MinMaxBottomValue<T, true, false> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return -Eigen::NumTraits<T>::infinity(); - } -}; -template <typename T> -struct MinMaxBottomValue<T, false, true> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return 
Eigen::NumTraits<T>::highest(); - } -}; -template <typename T> -struct MinMaxBottomValue<T, false, false> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return Eigen::NumTraits<T>::infinity(); - } -}; - - -template <typename T, int NaNPropagation=PropagateFast> struct MaxReducer -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - scalar_max_op<T, T, NaNPropagation> op; - *accum = op(t, *accum); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - scalar_max_op<T, T, NaNPropagation> op; - (*accum) = op.packetOp(*accum, p); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue<T, /*IsMax=*/true, Eigen::NumTraits<T>::IsInteger>::bottom_value(); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1<Packet>(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - scalar_max_op<T, T, NaNPropagation> op; - return op(saccum, op.predux(vaccum)); - } -}; - -template <typename T, typename Device, int NaNPropagation> - struct reducer_traits<MaxReducer<T, NaNPropagation>, Device> { - enum { - Cost = NumTraits<T>::AddCost, - PacketAccess = PacketType<T, Device>::HasMax, - IsStateful = false, - IsExactlyAssociative = (NaNPropagation!=PropagateFast) - }; -}; - -template <typename T, int NaNPropagation=PropagateFast> struct MinReducer -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - scalar_min_op<T, T, NaNPropagation> op; - *accum = op(t, *accum); - } - template <typename Packet> - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - scalar_min_op<T, T, NaNPropagation> op; - (*accum) = op.packetOp(*accum, p); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue<T, /*IsMax=*/false, Eigen::NumTraits<T>::IsInteger>::bottom_value(); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1<Packet>(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - scalar_min_op<T, T, NaNPropagation> op; - return op(saccum, op.predux(vaccum)); - } -}; - -template <typename T, typename Device, int NaNPropagation> - struct reducer_traits<MinReducer<T, NaNPropagation>, Device> { - enum { - Cost = NumTraits<T>::AddCost, - PacketAccess = PacketType<T, Device>::HasMin, - IsStateful = false, - IsExactlyAssociative = (NaNPropagation!=PropagateFast) - }; -}; - -template <typename T> struct ProdReducer -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - internal::scalar_product_op<T> prod_op; - (*accum) = prod_op(*accum, t); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmul<Packet>(*accum, p); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op<int, T> conv; - return conv(1); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1<Packet>(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template 
<typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_product_op<T> prod_op; - return prod_op(saccum, predux_mul(vaccum)); - } -}; - -template <typename T, typename Device> -struct reducer_traits<ProdReducer<T>, Device> { - enum { - Cost = NumTraits<T>::MulCost, - PacketAccess = PacketType<T, Device>::HasMul, - IsStateful = false, - IsExactlyAssociative = true - }; -}; - - -struct AndReducer -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { - *accum = *accum && t; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { - return accum; - } -}; - -template <typename Device> -struct reducer_traits<AndReducer, Device> { - enum { - Cost = 1, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true - }; -}; - - -struct OrReducer { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { - *accum = *accum || t; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { - return false; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { - return accum; - } -}; - -template <typename Device> -struct reducer_traits<OrReducer, Device> { - enum { - Cost = 1, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true - }; -}; - -// Argmin/Argmax reducers. Returns the first occurrence if multiple locations -// contain the same min/max value. 
-template <typename T> struct ArgMaxTupleReducer -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t.second < accum->second) { - return; - } else if (t.second > accum->second || accum->first > t.first ) { - *accum = t; - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return T(0, NumTraits<typename T::second_type>::lowest()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { - return accum; - } -}; - -template <typename T, typename Device> -struct reducer_traits<ArgMaxTupleReducer<T>, Device> { - enum { - Cost = NumTraits<T>::AddCost, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true - }; -}; - - -template <typename T> struct ArgMinTupleReducer -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { - if (t.second > accum->second) { - return; - } else if (t.second < accum->second || accum->first > t.first) { - *accum = t; - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return T(0, NumTraits<typename T::second_type>::highest()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { - return accum; - } -}; - -template <typename T, typename Device> -struct reducer_traits<ArgMinTupleReducer<T>, Device> { - enum { - Cost = NumTraits<T>::AddCost, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true - }; -}; - - -template <typename T, typename Index, size_t NumDims> -class GaussianGenerator { - public: - static const bool PacketAccess = false; - - EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means, - const array<T, NumDims>& std_devs) - : m_means(means) - { - EIGEN_UNROLL_LOOP - for (size_t i = 0; i < NumDims; ++i) { - m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; - } - } - - EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const { - T tmp = T(0); - EIGEN_UNROLL_LOOP - for (size_t i = 0; i < NumDims; ++i) { - T 
offset = coordinates[i] - m_means[i]; - tmp += offset * offset / m_two_sigmas[i]; - } - return numext::exp(-tmp); - } - - private: - array<T, NumDims> m_means; - array<T, NumDims> m_two_sigmas; -}; - -template <typename T, typename Index, size_t NumDims> -struct functor_traits<GaussianGenerator<T, Index, NumDims> > { - enum { - Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost + - functor_traits<scalar_quotient_op<T, T> >::Cost) + - functor_traits<scalar_exp_op<T> >::Cost, - PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess - }; -}; - -template <typename Scalar> -struct scalar_clamp_op { - EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar - operator()(const Scalar& x) const { - return numext::mini(numext::maxi(x, m_min), m_max); - } - template <typename Packet> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet - packetOp(const Packet& x) const { - return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max)); - } - const Scalar m_min; - const Scalar m_max; -}; -template<typename Scalar> -struct functor_traits<scalar_clamp_op<Scalar> > -{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; }; - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h deleted file mode 100644 index 174bf06..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h +++ /dev/null @@ -1,302 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H - -namespace Eigen { - -/** \class TensorGeneratorOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor generator class. - * - * - */ -namespace internal { -template<typename Generator, typename XprType> -struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename Generator, typename XprType> -struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense> -{ - typedef const TensorGeneratorOp<Generator, XprType>& type; -}; - -template<typename Generator, typename XprType> -struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type> -{ - typedef TensorGeneratorOp<Generator, XprType> type; -}; - -} // end namespace internal - - - -template<typename Generator, typename XprType> -class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested; - typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index 
Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) - : m_xpr(expr), m_generator(generator) {} - - EIGEN_DEVICE_FUNC - const Generator& generator() const { return m_generator; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Generator m_generator; -}; - - -// Eval as rvalue -template<typename Generator, typename ArgType, typename Device> -struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> -{ - typedef TensorGeneratorOp<Generator, ArgType> XprType; - typedef typename XprType::Index Index; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; - static const int NumDims = internal::array_size<Dimensions>::value; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - enum { - IsAligned = false, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = true, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - typedef internal::TensorIntDivisor<Index> IndexDivisor; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& 
device) - : m_device(device), m_generator(op.generator()) - { - TensorEvaluator<ArgType, Device> argImpl(op.expression(), device); - m_dimensions = argImpl.dimensions(); - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_strides[0] = 1; - EIGEN_UNROLL_LOOP - for (int i = 1; i < NumDims; ++i) { - m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; - if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); - } - } else { - m_strides[NumDims - 1] = 1; - EIGEN_UNROLL_LOOP - for (int i = NumDims - 2; i >= 0; --i) { - m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; - if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - return true; - } - EIGEN_STRONG_INLINE void cleanup() { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - array<Index, NumDims> coords; - extract_coordinates(index, coords); - return m_generator(coords); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = PacketType<CoeffReturnType, Device>::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = m_device.firstLevelCacheSize(); - // TODO(ezhulenev): Generator should have a cost. 
- return internal::TensorBlockResourceRequirements::skewed<Scalar>( - target_size); - } - - struct BlockIteratorState { - Index stride; - Index span; - Index size; - Index count; - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - static const bool is_col_major = - static_cast<int>(Layout) == static_cast<int>(ColMajor); - - // Compute spatial coordinates for the first block element. - array<Index, NumDims> coords; - extract_coordinates(desc.offset(), coords); - array<Index, NumDims> initial_coords = coords; - - // Offset in the output block buffer. - Index offset = 0; - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array<BlockIteratorState, NumDims> it; - for (int i = 0; i < NumDims; ++i) { - const int dim = is_col_major ? i : NumDims - 1 - i; - it[i].size = desc.dimension(dim); - it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); - it[i].span = it[i].stride * (it[i].size - 1); - it[i].count = 0; - } - eigen_assert(it[0].stride == 1); - - // Prepare storage for the materialized generator result. - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - - CoeffReturnType* block_buffer = block_storage.data(); - - static const int packet_size = PacketType<CoeffReturnType, Device>::size; - - static const int inner_dim = is_col_major ? 0 : NumDims - 1; - const Index inner_dim_size = it[0].size; - const Index inner_dim_vectorized = inner_dim_size - packet_size; - - while (it[NumDims - 1].count < it[NumDims - 1].size) { - Index i = 0; - // Generate data for the vectorized part of the inner-most dimension. - for (; i <= inner_dim_vectorized; i += packet_size) { - for (Index j = 0; j < packet_size; ++j) { - array<Index, NumDims> j_coords = coords; // Break loop dependence. 
- j_coords[inner_dim] += j; - *(block_buffer + offset + i + j) = m_generator(j_coords); - } - coords[inner_dim] += packet_size; - } - // Finalize non-vectorized part of the inner-most dimension. - for (; i < inner_dim_size; ++i) { - *(block_buffer + offset + i) = m_generator(coords); - coords[inner_dim]++; - } - coords[inner_dim] = initial_coords[inner_dim]; - - // For the 1d tensor we need to generate only one inner-most dimension. - if (NumDims == 1) break; - - // Update offset. - for (i = 1; i < NumDims; ++i) { - if (++it[i].count < it[i].size) { - offset += it[i].stride; - coords[is_col_major ? i : NumDims - 1 - i]++; - break; - } - if (i != NumDims - 1) it[i].count = 0; - coords[is_col_major ? i : NumDims - 1 - i] = - initial_coords[is_col_major ? i : NumDims - 1 - i]; - offset -= it[i].span; - } - } - - return block_storage.AsTensorMaterializedBlock(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool) const { - // TODO(rmlarsen): This is just a placeholder. Define interface to make - // generators return their cost. 
- return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() + - TensorOpCost::MulCost<Scalar>()); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {} -#endif - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void extract_coordinates(Index index, array<Index, NumDims>& coords) const { - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fast_strides[i]; - index -= idx * m_strides[i]; - coords[i] = idx; - } - coords[0] = index; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_fast_strides[i]; - index -= idx * m_strides[i]; - coords[i] = idx; - } - coords[NumDims-1] = index; - } - } - - const Device EIGEN_DEVICE_REF m_device; - Dimensions m_dimensions; - array<Index, NumDims> m_strides; - array<IndexDivisor, NumDims> m_fast_strides; - Generator m_generator; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h deleted file mode 100644 index 665b861..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h +++ /dev/null @@ -1,33 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H - -namespace Eigen { - -/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. - * - * This function computes the regularized incomplete beta function (integral). - * - */ -template <typename ADerived, typename BDerived, typename XDerived> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const - TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>, - const ADerived, const BDerived, const XDerived> - betainc(const ADerived& a, const BDerived& b, const XDerived& x) { - return TensorCwiseTernaryOp< - internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived, - const BDerived, const XDerived>( - a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>()); -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h deleted file mode 100644 index cb53ce2..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +++ /dev/null @@ -1,99 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) -#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H - -// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design -// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but -// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler -// When compiling such files, gcc will end up trying to pick up the CUDA headers by -// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) -// This will obviously not work when trying to compile tensorflow on a system with no CUDA -// To work around this issue for HIP systems (and leave the default behaviour intact), the -// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and -// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is -// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well - -#if defined(EIGEN_USE_HIP) - -#define gpuStream_t hipStream_t -#define gpuDeviceProp_t hipDeviceProp_t -#define gpuError_t hipError_t -#define gpuSuccess hipSuccess -#define gpuErrorNotReady hipErrorNotReady -#define gpuGetDeviceCount hipGetDeviceCount -#define gpuGetLastError hipGetLastError -#define gpuPeekAtLastError hipPeekAtLastError -#define gpuGetErrorName hipGetErrorName -#define gpuGetErrorString hipGetErrorString -#define gpuGetDeviceProperties hipGetDeviceProperties -#define gpuStreamDefault hipStreamDefault -#define gpuGetDevice hipGetDevice -#define gpuSetDevice hipSetDevice -#define gpuMalloc hipMalloc -#define gpuFree hipFree -#define gpuMemsetAsync hipMemsetAsync -#define gpuMemcpyAsync hipMemcpyAsync -#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost -#define gpuMemcpyHostToDevice hipMemcpyHostToDevice -#define gpuStreamQuery hipStreamQuery -#define 
gpuSharedMemConfig hipSharedMemConfig -#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig -#define gpuStreamSynchronize hipStreamSynchronize -#define gpuDeviceSynchronize hipDeviceSynchronize -#define gpuMemcpy hipMemcpy - -#else - -#define gpuStream_t cudaStream_t -#define gpuDeviceProp_t cudaDeviceProp -#define gpuError_t cudaError_t -#define gpuSuccess cudaSuccess -#define gpuErrorNotReady cudaErrorNotReady -#define gpuGetDeviceCount cudaGetDeviceCount -#define gpuGetLastError cudaGetLastError -#define gpuPeekAtLastError cudaPeekAtLastError -#define gpuGetErrorName cudaGetErrorName -#define gpuGetErrorString cudaGetErrorString -#define gpuGetDeviceProperties cudaGetDeviceProperties -#define gpuStreamDefault cudaStreamDefault -#define gpuGetDevice cudaGetDevice -#define gpuSetDevice cudaSetDevice -#define gpuMalloc cudaMalloc -#define gpuFree cudaFree -#define gpuMemsetAsync cudaMemsetAsync -#define gpuMemcpyAsync cudaMemcpyAsync -#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice -#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost -#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice -#define gpuStreamQuery cudaStreamQuery -#define gpuSharedMemConfig cudaSharedMemConfig -#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig -#define gpuStreamSynchronize cudaStreamSynchronize -#define gpuDeviceSynchronize cudaDeviceSynchronize -#define gpuMemcpy cudaMemcpy - -#endif - -// gpu_assert can be overridden -#ifndef gpu_assert - -#if defined(EIGEN_HIP_DEVICE_COMPILE) -// HIPCC do not support the use of assert on the GPU side. 
-#define gpu_assert(COND) -#else -#define gpu_assert(COND) assert(COND) -#endif - -#endif // gpu_assert - -#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h deleted file mode 100644 index 1d142f2..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +++ /dev/null @@ -1,44 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) - -#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES - -#undef gpuStream_t -#undef gpuDeviceProp_t -#undef gpuError_t -#undef gpuSuccess -#undef gpuErrorNotReady -#undef gpuGetDeviceCount -#undef gpuGetErrorString -#undef gpuGetDeviceProperties -#undef gpuStreamDefault -#undef gpuGetDevice -#undef gpuSetDevice -#undef gpuMalloc -#undef gpuFree -#undef gpuMemsetAsync -#undef gpuMemcpyAsync -#undef gpuMemcpyDeviceToDevice -#undef gpuMemcpyDeviceToHost -#undef gpuMemcpyHostToDevice -#undef gpuStreamQuery -#undef gpuSharedMemConfig -#undef gpuDeviceSetSharedMemConfig -#undef gpuStreamSynchronize -#undef gpuDeviceSynchronize -#undef gpuMemcpy - -#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES - -#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H - -#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h deleted file mode 100644 index a901c5d..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h +++ /dev/null @@ 
-1,79 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H -#define EIGEN_CXX11_TENSOR_TENSOR_IO_H - -namespace Eigen { - -namespace internal { - -// Print the tensor as a 2d matrix -template <typename Tensor, int Rank> -struct TensorPrinter { - static void run (std::ostream& os, const Tensor& tensor) { - typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar; - typedef typename Tensor::Index Index; - const Index total_size = internal::array_prod(tensor.dimensions()); - if (total_size > 0) { - const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); - static const int layout = Tensor::Layout; - Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim); - os << matrix; - } - } -}; - - -// Print the tensor as a vector -template <typename Tensor> -struct TensorPrinter<Tensor, 1> { - static void run (std::ostream& os, const Tensor& tensor) { - typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar; - typedef typename Tensor::Index Index; - const Index total_size = internal::array_prod(tensor.dimensions()); - if (total_size > 0) { - Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size); - os << array; - } - } -}; - - -// Print the tensor as a scalar -template <typename Tensor> -struct TensorPrinter<Tensor, 0> { - static void run (std::ostream& os, const Tensor& tensor) { - os << tensor.coeff(0); - } -}; -} - -template <typename T> -std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) { - typedef 
TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator; - typedef typename Evaluator::Dimensions Dimensions; - - // Evaluate the expression if needed - TensorForcedEvalOp<const T> eval = expr.eval(); - Evaluator tensor(eval, DefaultDevice()); - tensor.evalSubExprsIfNeeded(NULL); - - // Print the result - static const int rank = internal::array_size<Dimensions>::value; - internal::TensorPrinter<Evaluator, rank>::run(os, tensor); - - // Cleanup. - tensor.cleanup(); - return os; -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h deleted file mode 100644 index dd51850..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h +++ /dev/null @@ -1,603 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H -#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H - -namespace Eigen { - -/** \class TensorImagePatch - * \ingroup CXX11_Tensor_Module - * - * \brief Patch extraction specialized for image processing. - * This assumes that the input has a least 3 dimensions ordered as follow: - * 1st dimension: channels (of size d) - * 2nd dimension: rows (of size r) - * 3rd dimension: columns (of size c) - * There can be additional dimensions such as time (for video) or batch (for - * bulk processing after the first 3. - * Calling the image patch code with patch_rows and patch_cols is equivalent - * to calling the regular patch extraction code with parameters d, patch_rows, - * patch_cols, and 1 for all the additional dimensions. 
- */ -namespace internal { - -template<DenseIndex Rows, DenseIndex Cols, typename XprType> -struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType> -{ - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<DenseIndex Rows, DenseIndex Cols, typename XprType> -struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense> -{ - typedef const TensorImagePatchOp<Rows, Cols, XprType>& type; -}; - -template<DenseIndex Rows, DenseIndex Cols, typename XprType> -struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type> -{ - typedef TensorImagePatchOp<Rows, Cols, XprType> type; -}; - -template <typename Self, bool Vectorizable> -struct ImagePatchCopyOp { - typedef typename Self::Index Index; - typedef typename Self::Scalar Scalar; - typedef typename Self::Impl Impl; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Self& self, const Index num_coeff_to_copy, const Index dst_index, - Scalar* dst_data, const Index src_index) { - const Impl& impl = self.impl(); - for (Index i = 0; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i] = impl.coeff(src_index + i); - } - } -}; - -template <typename Self> -struct ImagePatchCopyOp<Self, true> { - typedef typename Self::Index Index; - typedef typename Self::Scalar Scalar; - typedef typename Self::Impl Impl; - typedef typename packet_traits<Scalar>::type Packet; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Self& self, const Index num_coeff_to_copy, const Index dst_index, 
- Scalar* dst_data, const Index src_index) { - const Impl& impl = self.impl(); - const Index packet_size = internal::unpacket_traits<Packet>::size; - const Index vectorized_size = - (num_coeff_to_copy / packet_size) * packet_size; - for (Index i = 0; i < vectorized_size; i += packet_size) { - Packet p = impl.template packet<Unaligned>(src_index + i); - internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p); - } - for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i] = impl.coeff(src_index + i); - } - } -}; - -template <typename Self> -struct ImagePatchPaddingOp { - typedef typename Self::Index Index; - typedef typename Self::Scalar Scalar; - typedef typename packet_traits<Scalar>::type Packet; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Index num_coeff_to_pad, const Scalar padding_value, - const Index dst_index, Scalar* dst_data) { - const Index packet_size = internal::unpacket_traits<Packet>::size; - const Packet padded_packet = internal::pset1<Packet>(padding_value); - const Index vectorized_size = - (num_coeff_to_pad / packet_size) * packet_size; - for (Index i = 0; i < vectorized_size; i += packet_size) { - internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, - padded_packet); - } - for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { - dst_data[dst_index + i] = padding_value; - } - } -}; - -} // end namespace internal - -template<DenseIndex Rows, DenseIndex Cols, typename XprType> -class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested; - typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind; - 
typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, - PaddingType padding_type, Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), - m_padding_type(padding_type), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, - DenseIndex padding_top, DenseIndex padding_bottom, - DenseIndex padding_left, DenseIndex padding_right, - Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), - m_padding_left(padding_left), m_padding_right(padding_right), - m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} - - - EIGEN_DEVICE_FUNC - DenseIndex patch_rows() const { return m_patch_rows; } - EIGEN_DEVICE_FUNC - DenseIndex patch_cols() const { return m_patch_cols; } - 
EIGEN_DEVICE_FUNC - DenseIndex row_strides() const { return m_row_strides; } - EIGEN_DEVICE_FUNC - DenseIndex col_strides() const { return m_col_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_row_strides() const { return m_in_row_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_col_strides() const { return m_in_col_strides; } - EIGEN_DEVICE_FUNC - DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } - EIGEN_DEVICE_FUNC - DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } - EIGEN_DEVICE_FUNC - bool padding_explicit() const { return m_padding_explicit; } - EIGEN_DEVICE_FUNC - DenseIndex padding_top() const { return m_padding_top; } - EIGEN_DEVICE_FUNC - DenseIndex padding_bottom() const { return m_padding_bottom; } - EIGEN_DEVICE_FUNC - DenseIndex padding_left() const { return m_padding_left; } - EIGEN_DEVICE_FUNC - DenseIndex padding_right() const { return m_padding_right; } - EIGEN_DEVICE_FUNC - PaddingType padding_type() const { return m_padding_type; } - EIGEN_DEVICE_FUNC - Scalar padding_value() const { return m_padding_value; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const DenseIndex m_patch_rows; - const DenseIndex m_patch_cols; - const DenseIndex m_row_strides; - const DenseIndex m_col_strides; - const DenseIndex m_in_row_strides; - const DenseIndex m_in_col_strides; - const DenseIndex m_row_inflate_strides; - const DenseIndex m_col_inflate_strides; - const bool m_padding_explicit; - const DenseIndex m_padding_top; - const DenseIndex m_padding_bottom; - const DenseIndex m_padding_left; - const DenseIndex m_padding_right; - const PaddingType m_padding_type; - const Scalar m_padding_value; -}; - -// Eval as rvalue -template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device> -struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> -{ - typedef 
TensorImagePatchOp<Rows, Cols, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - static const int NumDims = NumInputDims + 1; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, - Device> Self; - typedef TensorEvaluator<ArgType, Device> Impl; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) - : m_device(device), m_impl(op.expression(), device) - { - EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); - - m_paddingValue = op.padding_value(); - - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - - // Caches a few variables. 
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_inputDepth = input_dims[0]; - m_inputRows = input_dims[1]; - m_inputCols = input_dims[2]; - } else { - m_inputDepth = input_dims[NumInputDims-1]; - m_inputRows = input_dims[NumInputDims-2]; - m_inputCols = input_dims[NumInputDims-3]; - } - - m_row_strides = op.row_strides(); - m_col_strides = op.col_strides(); - - // Input strides and effective input/patch size - m_in_row_strides = op.in_row_strides(); - m_in_col_strides = op.in_col_strides(); - m_row_inflate_strides = op.row_inflate_strides(); - m_col_inflate_strides = op.col_inflate_strides(); - // The "effective" input rows and input cols are the input rows and cols - // after inflating them with zeros. - // For examples, a 2x3 matrix with row_inflate_strides and - // col_inflate_strides of 2 comes from: - // A B C - // D E F - // - // to a matrix is 3 x 5: - // - // A . B . C - // . . . . . - // D . E . F - - m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; - m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; - m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); - m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); - - if (op.padding_explicit()) { - m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); - m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); - m_rowPaddingTop = op.padding_top(); - m_colPaddingLeft = op.padding_left(); - } else { - // Computing padding from the type - switch (op.padding_type()) { - case PADDING_VALID: - m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); - m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); - // 
Calculate the padding - m_rowPaddingTop = numext::maxi<Index>(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); - m_colPaddingLeft = numext::maxi<Index>(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); - break; - case PADDING_SAME: - m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides)); - m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides)); - // Calculate the padding - m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; - m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; - // The padding size calculation for PADDING_SAME has been updated to - // be consistent with how TensorFlow extracts its paddings. - m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop); - m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft); - break; - default: - eigen_assert(false && "unexpected padding"); - m_outputCols=0; // silence the uninitialised warning; - m_outputRows=0; //// silence the uninitialised warning; - } - } - eigen_assert(m_outputRows > 0); - eigen_assert(m_outputCols > 0); - - // Dimensions for result of extraction. - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - // ColMajor - // 0: depth - // 1: patch_rows - // 2: patch_cols - // 3: number of patches - // 4 and beyond: anything else (such as batch). - m_dimensions[0] = input_dims[0]; - m_dimensions[1] = op.patch_rows(); - m_dimensions[2] = op.patch_cols(); - m_dimensions[3] = m_outputRows * m_outputCols; - for (int i = 4; i < NumDims; ++i) { - m_dimensions[i] = input_dims[i-1]; - } - } else { - // RowMajor - // NumDims-1: depth - // NumDims-2: patch_rows - // NumDims-3: patch_cols - // NumDims-4: number of patches - // NumDims-5 and beyond: anything else (such as batch). 
- m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; - m_dimensions[NumDims-2] = op.patch_rows(); - m_dimensions[NumDims-3] = op.patch_cols(); - m_dimensions[NumDims-4] = m_outputRows * m_outputCols; - for (int i = NumDims-5; i >= 0; --i) { - m_dimensions[i] = input_dims[i]; - } - } - - // Strides for moving the patch in various dimensions. - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_colStride = m_dimensions[1]; - m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; - m_otherStride = m_patchStride * m_dimensions[3]; - } else { - m_colStride = m_dimensions[NumDims-2]; - m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1]; - m_otherStride = m_patchStride * m_dimensions[NumDims-4]; - } - - // Strides for navigating through the input tensor. - m_rowInputStride = m_inputDepth; - m_colInputStride = m_inputDepth * m_inputRows; - m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; - - // Fast representations of different variables. - m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); - m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); - m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); - m_fastInflateRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides); - m_fastInflateColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides); - m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff); - - // Number of patches in the width dimension. 
- m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]); - } else { - m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - // Patch index corresponding to the passed in index. - const Index patchIndex = index / m_fastPatchStride; - // Find the offset of the element wrt the location of the first element. - const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; - - // Other ways to index this element. - const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; - const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; - - // Calculate col index in the input original tensor. - const Index colIndex = patch2DIndex / m_fastOutputRows; - const Index colOffset = patchOffset / m_fastColStride; - const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? 
(inputCol / m_fastInflateColStride) : 0); - if (inputCol < 0 || inputCol >= m_input_cols_eff || - ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { - return Scalar(m_paddingValue); - } - - // Calculate row index in the original input tensor. - const Index rowIndex = patch2DIndex - colIndex * m_outputRows; - const Index rowOffset = patchOffset - colOffset * m_colStride; - const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0); - if (inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { - return Scalar(m_paddingValue); - } - - const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1; - const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; - - const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; - return m_impl.coeff(inputIndex); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { - return packetWithPossibleZero(index); - } - - const Index indices[2] = {index, index + PacketSize - 1}; - const Index patchIndex = indices[0] / m_fastPatchStride; - if (patchIndex != indices[1] / m_fastPatchStride) { - return packetWithPossibleZero(index); - } - const Index otherIndex = (NumDims == 4) ? 
0 : indices[0] / m_fastOtherStride; - eigen_assert(otherIndex == indices[1] / m_fastOtherStride); - - // Find the offset of the element wrt the location of the first element. - const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, - (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; - - const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; - eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); - - const Index colIndex = patch2DIndex / m_fastOutputRows; - const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; - - // Calculate col indices in the original input tensor. - const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - - m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; - if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { - return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); - } - - if (inputCols[0] == inputCols[1]) { - const Index rowIndex = patch2DIndex - colIndex * m_outputRows; - const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; - eigen_assert(rowOffsets[0] <= rowOffsets[1]); - // Calculate col indices in the original input tensor. - const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - - m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; - - if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { - return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); - } - - if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { - // no padding - const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; - const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; - const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; - return m_impl.template packet<Unaligned>(inputIndex); - } - } - - return packetWithPossibleZero(index); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // We conservatively estimate the cost for the code path where the computed - // index is inside the original image and - // TensorEvaluator<ArgType, Device>::CoordAccess is false. 
- const double compute_cost = 3 * TensorOpCost::DivCost<Index>() + - 6 * TensorOpCost::MulCost<Index>() + - 8 * TensorOpCost::MulCost<Index>(); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const - { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - - Dimensions m_dimensions; - - Index m_otherStride; - Index m_patchStride; - Index m_colStride; - Index m_row_strides; - Index m_col_strides; - - Index m_in_row_strides; - Index m_in_col_strides; - Index m_row_inflate_strides; - Index m_col_inflate_strides; - - Index m_input_rows_eff; - Index m_input_cols_eff; - Index m_patch_rows_eff; - Index m_patch_cols_eff; - - internal::TensorIntDivisor<Index> m_fastOtherStride; - internal::TensorIntDivisor<Index> m_fastPatchStride; - internal::TensorIntDivisor<Index> m_fastColStride; - internal::TensorIntDivisor<Index> m_fastInflateRowStride; - internal::TensorIntDivisor<Index> m_fastInflateColStride; - internal::TensorIntDivisor<Index> m_fastInputColsEff; - - Index m_rowInputStride; - Index m_colInputStride; - Index m_patchInputStride; - - Index m_inputDepth; - Index m_inputRows; - Index m_inputCols; - - Index m_outputRows; - Index m_outputCols; - - Index m_rowPaddingTop; - Index m_colPaddingLeft; - - internal::TensorIntDivisor<Index> m_fastOutputRows; - internal::TensorIntDivisor<Index> m_fastOutputDepth; - - Scalar m_paddingValue; - - const Device EIGEN_DEVICE_REF m_device; - TensorEvaluator<ArgType, Device> m_impl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h 
b/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h deleted file mode 100644 index 2d8c7b9..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h +++ /dev/null @@ -1,738 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H - - -#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES - -#define EIGEN_HAS_INDEX_LIST - -namespace Eigen { - -/** \internal - * - * \class TensorIndexList - * \ingroup CXX11_Tensor_Module - * - * \brief Set of classes used to encode a set of Tensor dimensions/indices. - * - * The indices in the list can be known at compile time or at runtime. A mix - * of static and dynamic indices can also be provided if needed. The tensor - * code will attempt to take advantage of the indices that are known at - * compile time to optimize the code it generates. - * - * This functionality requires a c++11 compliant compiler. If your compiler - * is older you need to use arrays of indices instead. - * - * Several examples are provided in the cxx11_tensor_index_list.cpp file. - * - * \sa Tensor - */ - -template <Index n> -struct type2index { - static const Index value = n; - EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; } - EIGEN_DEVICE_FUNC void set(Index val) { - eigen_assert(val == n); - } -}; - -// This can be used with IndexPairList to get compile-time constant pairs, -// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>(). 
-template <Index f, Index s> -struct type2indexpair { - static const Index first = f; - static const Index second = s; - - constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const { - return IndexPair<Index>(f, s); - } - - EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) { - eigen_assert(val.first == f); - eigen_assert(val.second == s); - } -}; - - -template<Index n> struct NumTraits<type2index<n> > -{ - typedef Index Real; - enum { - IsComplex = 0, - RequireInitialization = false, - ReadCost = 1, - AddCost = 1, - MulCost = 1 - }; - - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; } - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; } - EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; } -}; - -namespace internal { -template <typename T> -EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) { - val = internal::convert_index<T>(new_val); -} -template <Index n> -EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) { - val.set(new_val); -} - -template <typename T> -EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) { - val = new_val; -} -template <Index f, Index s> -EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) { - val.set(new_val); -} - - -template <typename T> -struct is_compile_time_constant { - static constexpr bool value = false; -}; - -template <Index idx> -struct is_compile_time_constant<type2index<idx> > { - static constexpr bool value = true; -}; -template <Index idx> -struct is_compile_time_constant<const type2index<idx> > { - static constexpr bool value = true; -}; -template <Index idx> -struct is_compile_time_constant<type2index<idx>& > { - static constexpr bool value = true; -}; -template <Index idx> -struct is_compile_time_constant<const 
type2index<idx>& > { - static constexpr bool value = true; -}; - -template <Index f, Index s> -struct is_compile_time_constant<type2indexpair<f, s> > { - static constexpr bool value = true; -}; -template <Index f, Index s> -struct is_compile_time_constant<const type2indexpair<f, s> > { - static constexpr bool value = true; -}; -template <Index f, Index s> -struct is_compile_time_constant<type2indexpair<f, s>& > { - static constexpr bool value = true; -}; -template <Index f, Index s> -struct is_compile_time_constant<const type2indexpair<f, s>& > { - static constexpr bool value = true; -}; - - -template<typename... T> -struct IndexTuple; - -template<typename T, typename... O> -struct IndexTuple<T, O...> { - EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { } - EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { } - - constexpr static int count = 1 + sizeof...(O); - T head; - IndexTuple<O...> others; - typedef T Head; - typedef IndexTuple<O...> Other; -}; - -template<typename T> - struct IndexTuple<T> { - EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { } - EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { } - - constexpr static int count = 1; - T head; - typedef T Head; -}; - - -template<int N, typename... T> -struct IndexTupleExtractor; - -template<int N, typename T, typename... 
O> -struct IndexTupleExtractor<N, T, O...> { - - typedef typename IndexTupleExtractor<N-1, O...>::ValType ValType; - - EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) { - return IndexTupleExtractor<N-1, O...>::get_val(val.others); - } - - EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) { - return IndexTupleExtractor<N-1, O...>::get_val(val.others); - } - template <typename V> - EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) { - IndexTupleExtractor<N-1, O...>::set_val(val.others, new_val); - } - -}; - -template<typename T, typename... O> - struct IndexTupleExtractor<0, T, O...> { - - typedef T ValType; - - EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) { - return val.head; - } - EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) { - return val.head; - } - template <typename V> - EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) { - val.head = new_val; - } -}; - - - -template <int N, typename T, typename... O> -EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor<N, T, O...>::ValType& array_get(IndexTuple<T, O...>& tuple) { - return IndexTupleExtractor<N, T, O...>::get_val(tuple); -} -template <int N, typename T, typename... O> -EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor<N, T, O...>::ValType& array_get(const IndexTuple<T, O...>& tuple) { - return IndexTupleExtractor<N, T, O...>::get_val(tuple); -} -template <typename T, typename... O> - struct array_size<IndexTuple<T, O...> > { - static const size_t value = IndexTuple<T, O...>::count; -}; -template <typename T, typename... O> - struct array_size<const IndexTuple<T, O...> > { - static const size_t value = IndexTuple<T, O...>::count; -}; - - - - -template <Index Idx, typename ValueT> -struct tuple_coeff { - template <typename... 
T> - EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) { - // return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); - return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t)); - } - template <typename... T> - EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) { - if (i == Idx) { - update_value(array_get<Idx>(t), value); - } else { - tuple_coeff<Idx-1, ValueT>::set(i, t, value); - } - } - - template <typename... T> - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) { - return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) || - tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t); - } - - template <typename... T> - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) { - return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && - tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t); - } - - template <typename... T> - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>& t) { - return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && - is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && - array_get<Idx>(t) > array_get<Idx-1>(t) && - tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t); - } -}; - -template <typename ValueT> -struct tuple_coeff<0, ValueT> { - template <typename... T> - EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) { - // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr - return array_get<0>(t)/* * (i == 0)*/; - } - template <typename... 
T> - EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) { - eigen_assert (i == 0); - update_value(array_get<0>(t), value); - } - template <typename... T> - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) { - return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0); - } - - template <typename... T> - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>&) { - return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value; - } - - template <typename... T> - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>&) { - return true; - } -}; -} // namespace internal - - - -template<typename FirstType, typename... OtherTypes> -struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::set(i, *this, value); - } - - EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { } - EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) 
{ } - EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { } - - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this); - } - EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_known_statically(*this); - } - - EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_statically_known_to_increase(*this); - } -}; - -template <typename FirstType, typename... OtherTypes> -std::ostream& operator<<(std::ostream& os, - const IndexList<FirstType, OtherTypes...>& dims) { - os << "["; - for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { - if (i > 0) os << ", "; - os << dims[i]; - } - os << "]"; - return os; -} - -template<typename FirstType, typename... OtherTypes> -constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) { - return IndexList<FirstType, OtherTypes...>(val1, other_vals...); -} - - -template<typename FirstType, typename... 
OtherTypes> -struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[] (const Index i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<Index>>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<Index> >::set(i, *this, value); - } - - EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { } - EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { } - - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this); - } -}; - -namespace internal { - -template<typename FirstType, typename... OtherTypes> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { - Index result = 1; - EIGEN_UNROLL_LOOP - for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) { - result *= sizes[i]; - } - return result; -} - -template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > { - static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value; -}; -template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > { - static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value; -}; - -template<typename FirstType, typename... 
OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > { - static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; -}; -template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > { - static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; -}; - -template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) { - return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a); -} -template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) { - return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a); -} - -template <typename T> -struct index_known_statically_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { - return false; - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i); - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i); - } -}; - - -template <typename T> -struct all_indices_known_statically_impl { - static constexpr bool run() { - return false; - } -}; - -template <typename FirstType, typename... 
OtherTypes> -struct all_indices_known_statically_impl<IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); - } -}; - -template <typename FirstType, typename... OtherTypes> -struct all_indices_known_statically_impl<const IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); - } -}; - - -template <typename T> -struct indices_statically_known_to_increase_impl { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return false; - } -}; - -template <typename FirstType, typename... OtherTypes> - struct indices_statically_known_to_increase_impl<IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); - } -}; - -template <typename FirstType, typename... OtherTypes> - struct indices_statically_known_to_increase_impl<const IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); - } -}; - - -template <typename Tx> -struct index_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { - return false; - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexList<FirstType, OtherTypes...>().get(i) == value); - } -}; - -template <typename FirstType, typename... 
OtherTypes> -struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexList<FirstType, OtherTypes...>().get(i) == value); - } -}; - - -template <typename T> -struct index_statically_ne_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { - return false; - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexList<FirstType, OtherTypes...>().get(i) != value); - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexList<FirstType, OtherTypes...>().get(i) != value); - } -}; - - -template <typename T> -struct index_statically_gt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { - return false; - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexList<FirstType, OtherTypes...>().get(i) > value); - } -}; - -template <typename FirstType, typename... 
OtherTypes> -struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexList<FirstType, OtherTypes...>().get(i) > value); - } -}; - - - -template <typename T> -struct index_statically_lt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { - return false; - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexList<FirstType, OtherTypes...>().get(i) < value); - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexList<FirstType, OtherTypes...>().get(i) < value); - } -}; - - - -template <typename Tx> -struct index_pair_first_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { - return false; - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value); - } -}; - -template <typename FirstType, typename... 
OtherTypes> -struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value); - } -}; - - - -template <typename Tx> -struct index_pair_second_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { - return false; - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value); - } -}; - -template <typename FirstType, typename... OtherTypes> -struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & - (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value); - } -}; - - -} // end namespace internal -} // end namespace Eigen - -#else - -namespace Eigen { -namespace internal { - -template <typename T> -struct index_known_statically_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) { - return false; - } -}; - -template <typename T> -struct all_indices_known_statically_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return false; - } -}; - -template <typename T> -struct indices_statically_known_to_increase_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return false; - } -}; - -template <typename T> -struct index_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool 
run(Index, Index) { - return false; - } -}; - -template <typename T> -struct index_statically_ne_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { - return false; - } -}; - -template <typename T> -struct index_statically_gt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { - return false; - } -}; - -template <typename T> -struct index_statically_lt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { - return false; - } -}; - -template <typename Tx> -struct index_pair_first_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { - return false; - } -}; - -template <typename Tx> -struct index_pair_second_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { - return false; - } -}; - - - -} // end namespace internal -} // end namespace Eigen - -#endif - - -namespace Eigen { -namespace internal { -template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { - return index_known_statically_impl<T>::run(i); -} - -template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { - return all_indices_known_statically_impl<T>::run(); -} - -template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { - return indices_statically_known_to_increase_impl<T>::run(); -} - -template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { - return index_statically_eq_impl<T>::run(i, value); -} - -template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) { - return index_statically_ne_impl<T>::run(i, value); -} - -template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { - return index_statically_gt_impl<T>::run(i, value); -} - -template <typename 
T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { - return index_statically_lt_impl<T>::run(i, value); -} - -template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { - return index_pair_first_statically_eq_impl<T>::run(i, value); -} - -template <typename T> -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { - return index_pair_second_statically_eq_impl<T>::run(i, value); -} - -} // end namespace internal -} // end namespace Eigen - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h deleted file mode 100644 index c5cb61a..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h +++ /dev/null @@ -1,247 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Ke Yang <yangke@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H -#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H - -namespace Eigen { - -/** \class TensorInflation - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor inflation class. 
- * - * - */ -namespace internal { -template<typename Strides, typename XprType> -struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename Strides, typename XprType> -struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense> -{ - typedef const TensorInflationOp<Strides, XprType>& type; -}; - -template<typename Strides, typename XprType> -struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type> -{ - typedef TensorInflationOp<Strides, XprType> type; -}; - -} // end namespace internal - -template<typename Strides, typename XprType> -class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested; - typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) - : m_xpr(expr), m_strides(strides) {} - - EIGEN_DEVICE_FUNC - const Strides& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - 
protected: - typename XprType::Nested m_xpr; - const Strides m_strides; -}; - -// Eval as rvalue -template<typename Strides, typename ArgType, typename Device> -struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> -{ - typedef TensorInflationOp<Strides, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_strides(op.strides()) - { - m_dimensions = m_impl.dimensions(); - // Expand each dimension to the inflated dimension. - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; - } - - // Remember the strides for fast division. 
- for (int i = 0; i < NumDims; ++i) { - m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]); - } - - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_outputStrides[0] = 1; - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } - } else { // RowMajor - m_outputStrides[NumDims-1] = 1; - m_inputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - // Computes the input index given the output index. Returns true if the output - // index doesn't fall into a hole. 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const - { - eigen_assert(index < dimensions().TotalSize()); - *inputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (idx != idx / m_fastStrides[i] * m_strides[i]) { - return false; - } - *inputIndex += idx / m_strides[i] * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (index != index / m_fastStrides[0] * m_strides[0]) { - return false; - } - *inputIndex += index / m_strides[0]; - return true; - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (idx != idx / m_fastStrides[i] * m_strides[i]) { - return false; - } - *inputIndex += idx / m_strides[i] * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) { - return false; - } - *inputIndex += index / m_strides[NumDims - 1]; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - Index inputIndex = 0; - if (getInputIndex(index, &inputIndex)) { - return m_impl.coeff(inputIndex); - } else { - return Scalar(0); - } - } - - // TODO(yangke): optimize this function so that we can detect and produce - // all-zero packets - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() + - 3 * TensorOpCost::MulCost<Index>() + - 2 * TensorOpCost::AddCost<Index>()); - const double input_size = m_impl.dimensions().TotalSize(); - const double output_size = m_dimensions.TotalSize(); - if (output_size == 0) - return TensorOpCost(); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, - compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: - Dimensions m_dimensions; - array<Index, NumDims> m_outputStrides; - array<Index, NumDims> m_inputStrides; - TensorEvaluator<ArgType, Device> m_impl; - const Strides m_strides; - array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h deleted file mode 100644 index 26a3818..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h +++ /dev/null @@ -1,82 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H -#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H - -#if EIGEN_HAS_VARIADIC_TEMPLATES - -#include <initializer_list> - -namespace Eigen { - -/** \class TensorInitializer - * \ingroup CXX11_Tensor_Module - * - * \brief Helper template to initialize Tensors from std::initializer_lists. - */ -namespace internal { - -template <typename Derived, int N> -struct Initializer { - typedef std::initializer_list< - typename Initializer<Derived, N - 1>::InitList> InitList; - - static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, - Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, - const InitList& vals) { - int i = 0; - for (const auto& v : vals) { - (*indices)[traits<Derived>::NumDimensions - N] = i++; - Initializer<Derived, N - 1>::run(tensor, indices, v); - } - } -}; - -template <typename Derived> -struct Initializer<Derived, 1> { - typedef std::initializer_list<typename traits<Derived>::Scalar> InitList; - - static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, - Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, - const InitList& vals) { - int i = 0; - // There is likely a faster way to do that than iterating. 
- for (const auto& v : vals) { - (*indices)[traits<Derived>::NumDimensions - 1] = i++; - tensor.coeffRef(*indices) = v; - } - } -}; - -template <typename Derived> -struct Initializer<Derived, 0> { - typedef typename traits<Derived>::Scalar InitList; - - static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, - Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>*, - const InitList& v) { - tensor.coeffRef(0) = v; - } -}; - - -template <typename Derived, int N> -void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor, - const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) { - Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices; - Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals); -} - -} // namespace internal -} // namespace Eigen - -#endif // EIGEN_HAS_VARIADIC_TEMPLATES - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h deleted file mode 100644 index 6d5cce4..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h +++ /dev/null @@ -1,263 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H -#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H - - -namespace Eigen { - -/** \internal - * - * \class TensorIntDiv - * \ingroup CXX11_Tensor_Module - * - * \brief Fast integer division by a constant. - * - * See the paper from Granlund and Montgomery for explanation. 
- * (at https://doi.org/10.1145/773473.178249) - * - * \sa Tensor - */ - -namespace internal { - -namespace { - - // Note: result is undefined if val == 0 - template <typename T> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val) - { -#ifdef EIGEN_GPU_COMPILE_PHASE - return __clz(val); -#elif defined(SYCL_DEVICE_ONLY) - return cl::sycl::clz(val); -#elif EIGEN_COMP_MSVC - unsigned long index; - _BitScanReverse(&index, val); - return 31 - index; -#else - EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); - return __builtin_clz(static_cast<uint32_t>(val)); -#endif - } - - template <typename T> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val) - { -#ifdef EIGEN_GPU_COMPILE_PHASE - return __clzll(val); -#elif defined(SYCL_DEVICE_ONLY) - return static_cast<int>(cl::sycl::clz(val)); -#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 - unsigned long index; - _BitScanReverse64(&index, val); - return 63 - index; -#elif EIGEN_COMP_MSVC - // MSVC's _BitScanReverse64 is not available for 32bits builds. 
- unsigned int lo = (unsigned int)(val&0xffffffff); - unsigned int hi = (unsigned int)((val>>32)&0xffffffff); - int n; - if(hi==0) - n = 32 + count_leading_zeros<unsigned int>(lo); - else - n = count_leading_zeros<unsigned int>(hi); - return n; -#else - EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); - return __builtin_clzll(static_cast<uint64_t>(val)); -#endif - } - - template <typename T> - struct UnsignedTraits { - typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type; - }; - - template <typename T> - struct DividerTraits { - typedef typename UnsignedTraits<T>::type type; - static const int N = sizeof(T) * 8; - }; - - template <typename T> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __umulhi(a, b); -#elif defined(SYCL_DEVICE_ONLY) - return cl::sycl::mul_hi(a, static_cast<uint32_t>(b)); -#else - return (static_cast<uint64_t>(a) * b) >> 32; -#endif - } - - template <typename T> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __umul64hi(a, b); -#elif defined(SYCL_DEVICE_ONLY) - return cl::sycl::mul_hi(a, static_cast<uint64_t>(b)); -#elif EIGEN_HAS_BUILTIN_INT128 - __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); - return static_cast<uint64_t>(v >> 64); -#else - return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper(); -#endif - } - - template <int N, typename T> - struct DividerHelper { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { - EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); - return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1); - } - }; - - template <typename T> - struct DividerHelper<64, T> { - static 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) - return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); -#else - const uint64_t shift = 1ULL << log_div; - TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - - TensorUInt128<static_val<1>, static_val<0> >(1, 0) - + TensorUInt128<static_val<0>, static_val<1> >(1); - return static_cast<uint64_t>(result); -#endif - } - }; -} - - -template <typename T, bool div_gt_one = false> -struct TensorIntDivisor { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { - multiplier = 0; - shift1 = 0; - shift2 = 0; - } - - // Must have 0 < divider < 2^31. This is relaxed to - // 0 < divider < 2^63 when using 64-bit indices on platforms that support - // the __uint128_t type. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { - const int N = DividerTraits<T>::N; - eigen_assert(static_cast<typename UnsignedTraits<T>::type>(divider) < NumTraits<UnsignedType>::highest()/2); - eigen_assert(divider > 0); - - // fast ln2 - const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider)); - int log_div = N - leading_zeros; - // if divider is a power of two then log_div is 1 more than it should be. - if ((static_cast<typename UnsignedTraits<T>::type>(1) << (log_div-1)) == static_cast<typename UnsignedTraits<T>::type>(divider)) - log_div--; - - multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider); - shift1 = log_div > 1 ? 1 : log_div; - shift2 = log_div > 1 ? log_div-1 : 0; - } - - // Must have 0 <= numerator. On platforms that don't support the __uint128_t - // type numerator should also be less than 2^32-1. 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { - eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2); - //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above - - UnsignedType t1 = muluh(multiplier, numerator); - UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1; - return (t1 + t) >> shift2; - } - - private: - typedef typename DividerTraits<T>::type UnsignedType; - UnsignedType multiplier; - int32_t shift1; - int32_t shift2; -}; - - -// Optimized version for signed 32 bit integers. -// Derived from Hacker's Delight. -// Only works for divisors strictly greater than one -template <> -class TensorIntDivisor<int32_t, true> { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { - magic = 0; - shift = 0; - } - // Must have 2 <= divider - EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { - eigen_assert(divider >= 2); - calcMagic(divider); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef EIGEN_GPU_COMPILE_PHASE - return (__umulhi(magic, n) >> shift); -#elif defined(SYCL_DEVICE_ONLY) - return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift); -#else - uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); - return (static_cast<uint32_t>(v >> 32) >> shift); -#endif - } - -private: - // Compute the magic numbers. See Hacker's Delight section 10 for an in - // depth explanation. - EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { - const unsigned two31 = 0x80000000; // 2**31. - unsigned ad = d; - unsigned t = two31 + (ad >> 31); - unsigned anc = t - 1 - t%ad; // Absolute value of nc. - int p = 31; // Init. p. - unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. - unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). - unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. - unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). 
- unsigned delta = 0; - do { - p = p + 1; - q1 = 2*q1; // Update q1 = 2**p/|nc|. - r1 = 2*r1; // Update r1 = rem(2**p, |nc|). - if (r1 >= anc) { // (Must be an unsigned - q1 = q1 + 1; // comparison here). - r1 = r1 - anc;} - q2 = 2*q2; // Update q2 = 2**p/|d|. - r2 = 2*r2; // Update r2 = rem(2**p, |d|). - if (r2 >= ad) { // (Must be an unsigned - q2 = q2 + 1; // comparison here). - r2 = r2 - ad;} - delta = ad - r2; - } while (q1 < delta || (q1 == delta && r1 == 0)); - - magic = (unsigned)(q2 + 1); - shift = p - 32; - } - - uint32_t magic; - int32_t shift; -}; - - -template <typename T, bool div_gt_one> -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) { - return divisor.divide(numerator); -} - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h deleted file mode 100644 index 80106c1..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h +++ /dev/null @@ -1,216 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H -#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H - -namespace Eigen { - -/** \class TensorLayoutSwap - * \ingroup CXX11_Tensor_Module - * - * \brief Swap the layout from col-major to row-major, or row-major - * to col-major, and invert the order of the dimensions. - * - * Beware: the dimensions are reversed by this operation. 
If you want to - * preserve the ordering of the dimensions, you need to combine this - * operation with a shuffle. - * - * \example: - * Tensor<float, 2, ColMajor> input(2, 4); - * Tensor<float, 2, RowMajor> output = input.swap_layout(); - * eigen_assert(output.dimension(0) == 4); - * eigen_assert(output.dimension(1) == 2); - * - * array<int, 2> shuffle(1, 0); - * output = input.swap_layout().shuffle(shuffle); - * eigen_assert(output.dimension(0) == 2); - * eigen_assert(output.dimension(1) == 4); - * - */ -namespace internal { -template<typename XprType> -struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = traits<XprType>::NumDimensions; - static const int Layout = (traits<XprType>::Layout == ColMajor) ? 
RowMajor : ColMajor; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename XprType> -struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense> -{ - typedef const TensorLayoutSwapOp<XprType>& type; -}; - -template<typename XprType> -struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type> -{ - typedef TensorLayoutSwapOp<XprType> type; -}; - -} // end namespace internal - - - -template<typename XprType> -class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> -{ - public: - typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base; - typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested; - typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp) - protected: - typename XprType::Nested m_xpr; -}; - - -// Eval as rvalue -template<typename ArgType, typename Device> -struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> -{ - typedef TensorLayoutSwapOp<ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - - enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - 
BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator<ArgType, Device>::RawAccess - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - for(int i = 0; i < NumDims; ++i) { - m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; - } - } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - return m_impl.evalSubExprsIfNeeded(data); - } - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(index); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_impl.template packet<LoadMode>(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized); - 
} - - EIGEN_DEVICE_FUNC typename Storage::Type data() const { - return constCast(m_impl.data()); - } - - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - - protected: - TensorEvaluator<ArgType, Device> m_impl; - Dimensions m_dimensions; -}; - - -// Eval as lvalue -template<typename ArgType, typename Device> - struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device> - : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> -{ - typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base; - typedef TensorLayoutSwapOp<ArgType> XprType; - - enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, - CoordAccess = false // to be implemented - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(index); - } - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - this->m_impl.template writePacket<StoreMode>(index, x); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git 
a/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h deleted file mode 100644 index 73ff3d2..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h +++ /dev/null @@ -1,98 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H -#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H - - -/** use this macro in sfinae selection in templated functions - * - * template<typename T, - * typename std::enable_if< isBanana<T>::value , int >::type = 0 - * > - * void foo(){} - * - * becomes => - * - * template<typename TopoType, - * SFINAE_ENABLE_IF( isBanana<T>::value ) - * > - * void foo(){} - */ - -// SFINAE requires variadic templates -#if !defined(EIGEN_GPUCC) -#if EIGEN_HAS_VARIADIC_TEMPLATES - // SFINAE doesn't work for gcc <= 4.7 - #ifdef EIGEN_COMP_GNUC - #if EIGEN_GNUC_AT_LEAST(4,8) - #define EIGEN_HAS_SFINAE - #endif - #else - #define EIGEN_HAS_SFINAE - #endif -#endif -#endif - -#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ - typename internal::enable_if< ( __condition__ ) , int >::type = 0 - -// Define a macro to use a reference on the host but a value on the device -#if defined(SYCL_DEVICE_ONLY) - #define EIGEN_DEVICE_REF -#else - #define EIGEN_DEVICE_REF & -#endif - -// Define a macro for catching SYCL exceptions if exceptions are enabled -#define EIGEN_SYCL_TRY_CATCH(X) \ - do { \ - EIGEN_TRY {X;} \ - EIGEN_CATCH(const cl::sycl::exception& e) { \ - EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \ - std::string(__FILE__) + ":" + \ - std::to_string(__LINE__) + "\n" + \ - e.what())); \ - } \ - } while (false) - -// Define 
a macro if local memory flags are unset or one of them is set -// Setting both flags is the same as unsetting them -#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ - (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) - #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 - #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 -#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) - #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 -#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) - #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 -#endif - -#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) - #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ - using Base::operator =; \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ - template <typename OtherDerived> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; } -#else - #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ - EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) -#endif - -/** \internal - * \brief Macro to manually inherit assignment operators. - * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined. - * This also inherits template<OtherDerived> operator=(const OtherDerived&) assignments. 
- * With C++11 or later this also default-implements the copy-constructor - */ -#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ - EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ - EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) - -#endif diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h deleted file mode 100644 index 6834c97..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h +++ /dev/null @@ -1,327 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H -#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H - -namespace Eigen { - -// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) - -/** \class TensorMap - * \ingroup CXX11_Tensor_Module - * - * \brief A tensor expression mapping an existing array of data. - * - */ -/// `template <class> class MakePointer_` is added to convert the host pointer to the device pointer. -/// It is added due to the fact that for our device compiler `T*` is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. -/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_<T>` is `T*` . -/// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is `T*`. 
-template<typename PlainObjectType, int Options_, template <class> class MakePointer_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > -{ - public: - typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self; - typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base; - #ifdef EIGEN_USE_SYCL - typedef typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested; - #else - typedef typename Eigen::internal::nested<Self>::type Nested; - #endif - typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; - typedef typename internal::traits<PlainObjectType>::Index Index; - typedef typename internal::traits<PlainObjectType>::Scalar Scalar; - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType; - - typedef typename MakePointer_<Scalar>::Type PointerType; - typedef typename MakePointer_<Scalar>::ConstType PointerConstType; - - // WARN: PointerType still can be a pointer to const (const Scalar*), for - // example in TensorMap<Tensor<const Scalar, ...>> expression. This type of - // expression should be illegal, but adding this restriction is not possible - // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). - typedef typename internal::conditional< - bool(internal::is_lvalue<PlainObjectType>::value), - PointerType, // use simple pointer in lvalue expressions - PointerConstType // use const pointer in rvalue expressions - >::type StoragePointerType; - - // If TensorMap was constructed over rvalue expression (e.g. const Tensor), - // we should return a reference to const from operator() (and others), even - // if TensorMap itself is not const. 
- typedef typename internal::conditional< - bool(internal::is_lvalue<PlainObjectType>::value), - Scalar&, - const Scalar& - >::type StorageRefType; - - static const int Options = Options_; - - static const Index NumIndices = PlainObjectType::NumIndices; - typedef typename PlainObjectType::Dimensions Dimensions; - - enum { - IsAligned = ((int(Options_)&Aligned)==Aligned), - Layout = PlainObjectType::Layout, - CoordAccess = true, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { - EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { - EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { - EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { - EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array<Index, NumIndices>& dimensions) - : m_data(dataPtr), m_dimensions(dimensions) - { } - - template <typename Dimensions> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) - : m_data(dataPtr), m_dimensions(dimensions) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) - : m_data(tensor.data()), m_dimensions(tensor.dimensions()) - { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Dimensions& 
dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const - { - // eigen_assert(checkIndexRange(indices)); - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(indices); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(indices); - return m_data[index]; - } - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) - return m_data[0]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_data[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const - { - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...)); - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[1]; - return m_data[index]; - } else { - const Index index = i0 + i1 * m_dimensions[0]; - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); - return m_data[index]; - } else { - 
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); - return m_data[index]; - } - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) - { - // eigen_assert(checkIndexRange(indices)); - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(indices); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(indices); - return m_data[index]; - } - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) - return m_data[0]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_data[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) - { - static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...)); - const std::size_t NumDims = sizeof...(otherIndices) + 2; - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[1]; - return m_data[index]; - } else { - const Index index = i0 + i1 * m_dimensions[0]; - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i4 + m_dimensions[4] * (i3 + 
m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); - return m_data[index]; - } - } -#endif - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap) - - private: - StoragePointerType m_data; - Dimensions m_dimensions; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h deleted file mode 100644 index a6181d3..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h +++ /dev/null @@ -1,311 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H -#define EIGEN_CXX11_TENSOR_TENSOR_META_H - -namespace Eigen { - -template<bool cond> struct Cond {}; - -template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -const T1& choose(Cond<true>, const T1& first, const T2&) { - return first; -} - -template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -const T2& choose(Cond<false>, const T1&, const T2& second) { - return second; -} - - -template <typename T, typename X, typename Y> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T divup(const X x, const Y y) { - return static_cast<T>((x + y - 1) / y); -} - -template <typename T> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T divup(const T x, const T y) { - return static_cast<T>((x + y - 1) / y); -} - -template <size_t n> struct max_n_1 { - static const size_t size = n; -}; -template <> struct max_n_1<0> { - static const size_t size = 1; -}; - - -// Default packet types -template <typename Scalar, typename Device> -struct PacketType : internal::packet_traits<Scalar> { - typedef typename internal::packet_traits<Scalar>::type type; -}; - -// For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) - -typedef ulonglong2 Packet4h2; -template<> -struct PacketType<half, GpuDevice> { - typedef Packet4h2 type; - static const int size = 8; - enum { - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 0, - HasMin = 1, - HasMax = 1, - HasConj = 0, - HasSetLinear = 0, - HasBlend = 0, - - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasExpm1 = 0, - HasLog = 1, - HasLog1p = 0, - HasLog10 = 0, - HasPow = 1, - }; -}; -#endif - -#if defined(EIGEN_USE_SYCL) - -namespace TensorSycl { -namespace internal { - -template <typename Index, Index A, Index B> struct PlusOp { - static constexpr Index Value = A + B; -}; - -template <typename Index, Index A, Index B> struct DivOp { - static constexpr Index Value = A / B; -}; - 
-template <typename Index, Index start, Index end, Index step, - template <class Indx, Indx...> class StepOp> -struct static_for { - template <typename UnaryOperator> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { - op(start); - static_for<Index, StepOp<Index, start, step>::Value, end, step, - StepOp>::loop(op); - } -}; -template <typename Index, Index end, Index step, - template <class Indx, Indx...> class StepOp> -struct static_for<Index, end, end, step, StepOp> { - template <typename UnaryOperator> - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} -}; - -template <typename OutScalar, typename Device, bool Vectorizable> -struct Vectorise { - static const int PacketSize = 1; - typedef OutScalar PacketReturnType; -}; - -template <typename OutScalar, typename Device> -struct Vectorise<OutScalar, Device, true> { - static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size; - typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType; -}; - -static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { - return ((((x) + (y)-1) / (y)) * (y)); -} - -} // namespace internal -} // namespace TensorSycl - -template <> - struct PacketType<half, SyclDevice> { - typedef half type; - static const int size = 1; - enum { - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasArg = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasBlend = 0 - }; -}; -template <typename Scalar> -struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits { - typedef Scalar type; - typedef Scalar half; - enum { - Vectorizable = 0, - size = 1, - AlignedOnScalar = 0, - HasHalfPacket = 0 - }; - enum { - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0 - }; - -}; - -template <typename Scalar> -struct PacketType<Scalar, const 
SyclDevice> : PacketType<Scalar, SyclDevice>{}; - -#ifndef EIGEN_DONT_VECTORIZE_SYCL -#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\ -template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \ -{\ - typedef typename internal::packet_traits<Type>::type type;\ - typedef typename internal::packet_traits<Type>::half half;\ -}; - - -PACKET_TYPE(const, float, 1, 4, SyclDevice) -PACKET_TYPE(, float, 1, 4, SyclDevice) -PACKET_TYPE(const, float, 1, 4, const SyclDevice) -PACKET_TYPE(, float, 1, 4, const SyclDevice) - -PACKET_TYPE(const, double, 0, 2, SyclDevice) -PACKET_TYPE(, double, 0, 2, SyclDevice) -PACKET_TYPE(const, double, 0, 2, const SyclDevice) -PACKET_TYPE(, double, 0, 2, const SyclDevice) -#undef PACKET_TYPE - -template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{}; -template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{}; -#endif -#endif - -// Tuple mimics std::pair but works on e.g. nvcc. 
-template <typename U, typename V> struct Tuple { - public: - U first; - V second; - - typedef U first_type; - typedef V second_type; - - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple() : first(), second() {} - - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple(const U& f, const V& s) : first(f), second(s) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void swap(Tuple& rhs) { - using numext::swap; - swap(first, rhs.first); - swap(second, rhs.second); - } -}; - -template <typename U, typename V> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -bool operator==(const Tuple<U, V>& x, const Tuple<U, V>& y) { - return (x.first == y.first && x.second == y.second); -} - -template <typename U, typename V> -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) { - return !(x == y); -} - - -// Can't use std::pairs on cuda devices -template <typename Idx> struct IndexPair { - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} - - EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) { - first = val.first; - second = val.second; - } - - Idx first; - Idx second; -}; - - -#ifdef EIGEN_HAS_SFINAE -namespace internal { - - template<typename IndexType, typename Index, Index... Is> - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) { - return { idx[Is]... 
}; - } - template<typename IndexType, typename Index> - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) { - return array<Index, 0>(); - } - - /** Make an array (for index/dimensions) out of a custom index */ - template<typename Index, std::size_t NumIndices, typename IndexType> - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array<Index, NumIndices> customIndices2Array(IndexType& idx) { - return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{}); - } - - - template <typename B, typename D> - struct is_base_of - { - - typedef char (&yes)[1]; - typedef char (&no)[2]; - - template <typename BB, typename DD> - struct Host - { - operator BB*() const; - operator DD*(); - }; - - template<typename T> - static yes check(D*, T); - static no check(B*, int); - - static const bool value = sizeof(check(Host<B,D>(), int())) == sizeof(yes); - }; - -} -#endif - - - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h deleted file mode 100644 index b3f00f7..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h +++ /dev/null @@ -1,1102 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H -#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H - -namespace Eigen { - -/** \class TensorReshaping - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. 
- * - * - */ -namespace internal { -template<typename NewDimensions, typename XprType> -struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = array_size<NewDimensions>::value; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename NewDimensions, typename XprType> -struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense> -{ - typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type; -}; - -template<typename NewDimensions, typename XprType> -struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type> -{ - typedef TensorReshapingOp<NewDimensions, XprType> type; -}; - -} // end namespace internal - - - -template<typename NewDimensions, typename XprType> -class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> -{ - public: - typedef TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> Base; - typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested; - typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) - : m_xpr(expr), m_dims(dims) {} - - EIGEN_DEVICE_FUNC - const NewDimensions& dimensions() const 
{ return m_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp) - - protected: - typename XprType::Nested m_xpr; - const NewDimensions m_dims; -}; - - -// Eval as rvalue -template<typename NewDimensions, typename ArgType, typename Device> -struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> -{ - typedef TensorReshapingOp<NewDimensions, ArgType> XprType; - typedef NewDimensions Dimensions; - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage; - - static const int NumOutputDims = internal::array_size<Dimensions>::value; - static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - - enum ReshapingKind { - // We do not use layout information to determine reshaping kind. - // Depending on the layout `N` can be inner or outer dimension. - OneByN = 0, // expr.reshape(1, N) - NByOne = 1, // expr.reshape(N, 1) - Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). - }; - - // clang-format off - static const ReshapingKind kind = -#if defined(EIGEN_HAS_INDEX_LIST) - (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN - : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? 
NByOne - : Runtime; -#else - Runtime; -#endif - // clang-format on - - enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - // For trivial reshapes with raw access to underlying data we will provide - // zero overhead block access. - // TODO(ezhulenev): Consider adding block access without raw access? - BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess && - NumInputDims > 0 && NumOutputDims > 0, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator<ArgType, Device>::RawAccess - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef - typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.dimensions()) - { - // The total size of the reshaped tensor must be equal to the total size - // of the input tensor. 
- eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType data, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); - } -#endif - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - return m_impl.evalSubExprsIfNeeded(data); - } - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(index); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_impl.template packet<LoadMode>(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::any(); - } - - // required in block(OutputTensorBlock* output_block) const - // For C++03 compatibility this must be defined outside the method - struct BlockIteratorState { - Index stride; - Index span; - Index size; - Index count; - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - eigen_assert(m_impl.data() != NULL); - eigen_assert((kind == Runtime) || - (kind == OneByN && desc.dimensions()[0] == 1) || - (kind == NByOne && desc.dimensions()[1] == 1)); - - if (kind == OneByN || kind == NByOne) { - // We can guarantee at compile time that block is just a contiguous slice - // of the underlying expression memory buffer. 
- return TensorBlock(internal::TensorBlockKind::kView, - m_impl.data() + desc.offset(), desc.dimensions()); - } else { - // This will do additional runtime checks, and in the end it might be also - // a view, or it might be a block materialized in the temporary buffer. - return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, - scratch); - } - } - - EIGEN_DEVICE_FUNC typename Storage::Type data() const { - return constCast(m_impl.data()); - } - - EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - - #ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } - #endif - protected: - TensorEvaluator<ArgType, Device> m_impl; - NewDimensions m_dimensions; -}; - - -// Eval as lvalue -template<typename NewDimensions, typename ArgType, typename Device> - struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device> - : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> - -{ - typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base; - typedef TensorReshapingOp<NewDimensions, ArgType> XprType; - typedef NewDimensions Dimensions; - - enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator<ArgType, Device>::RawAccess - }; - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type 
PacketReturnType; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index> - TensorBlockDesc; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(index); - } - - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - this->m_impl.template writePacket<StoreMode>(index, x); - } - - template <typename TensorBlock> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - assert(this->m_impl.data() != NULL); - - typedef typename TensorBlock::XprType TensorBlockExpr; - typedef internal::TensorBlockAssignment< - Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index> - TensorBlockAssign; - - TensorBlockAssign::Run( - TensorBlockAssign::target(desc.dimensions(), - internal::strides<Layout>(this->dimensions()), - this->m_impl.data(), desc.offset()), - block.expr()); - } -}; - - -/** \class TensorSlicing - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor slicing class. 
- * - * - */ -namespace internal { -template<typename StartIndices, typename Sizes, typename XprType> -struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = array_size<StartIndices>::value; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename StartIndices, typename Sizes, typename XprType> -struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> -{ - typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type; -}; - -template<typename StartIndices, typename Sizes, typename XprType> -struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type> -{ - typedef TensorSlicingOp<StartIndices, Sizes, XprType> type; -}; - -} // end namespace internal - - - -template<typename StartIndices, typename Sizes, typename XprType> -class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > -{ - public: - typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base; - typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested; - typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) - : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} - - 
EIGEN_DEVICE_FUNC - const StartIndices& startIndices() const { return m_indices; } - EIGEN_DEVICE_FUNC - const Sizes& sizes() const { return m_sizes; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp) - - protected: - typename XprType::Nested m_xpr; - const StartIndices m_indices; - const Sizes m_sizes; -}; - - -// Fixme: figure out the exact threshold -namespace { -template <typename Index, typename Device, bool BlockAccess> struct MemcpyTriggerForSlicing { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } - EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { - const bool prefer_block_evaluation = BlockAccess && total > 32*1024; - return !prefer_block_evaluation && contiguous > threshold_; - } - - private: - Index threshold_; -}; - -// It is very expensive to start the memcpy kernel on GPU: we therefore only -// use it for large copies. -#ifdef EIGEN_USE_GPU -template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } -}; -#endif - -// It is very expensive to start the memcpy kernel on GPU: we therefore only -// use it for large copies. 
-#ifdef EIGEN_USE_SYCL -template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess> { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } -}; -#endif - -} - -// Eval as rvalue -template<typename StartIndices, typename Sizes, typename ArgType, typename Device> -struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> -{ - typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; - static const int NumDims = internal::array_size<Sizes>::value; - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef Sizes Dimensions; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets and sizes. - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess && - // FIXME: Temporary workaround for bug in slicing of bool tensors. 
- !internal::is_same<typename internal::remove_const<Scalar>::type, bool>::value, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, - RawAccess = false - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - // Tensor slicing does not change the block type. - typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) - { - m_is_identity = true; - for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= - op.sizes()[i] + op.startIndices()[i]); - if (m_impl.dimensions()[i] != op.sizes()[i] || - op.startIndices()[i] != 0) { - m_is_identity = false; - } - } - - // No strides for scalars. - if (NumDims == 0) return; - - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - const Sizes& output_dims = op.sizes(); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } - - // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); - } - } else { - m_inputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - } - - // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - m_impl.evalSubExprsIfNeeded(NULL); - if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization - && data && m_impl.data()) { - Index contiguous_values = 1; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - contiguous_values *= dimensions()[i]; - if (dimensions()[i] != m_impl.dimensions()[i]) { - break; - } - } - } else { - for (int i = NumDims-1; i >= 0; --i) { - contiguous_values *= dimensions()[i]; - if (dimensions()[i] != m_impl.dimensions()[i]) { - break; - } - } - } - // Use memcpy if it's going to be faster than using the regular evaluation. 
- const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device); - if (trigger(internal::array_prod(dimensions()), contiguous_values)) { - EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); - for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { - Index offset = srcCoeff(i); - m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar)); - } - return false; - } - } - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - if (m_is_identity) { - return m_impl.coeff(index); - } else { - return m_impl.coeff(srcCoeff(index)); - } - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = PacketType<CoeffReturnType, Device>::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); - - if (m_is_identity) { - return m_impl.template packet<LoadMode>(index); - } - - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * 
m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); - inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); - } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); - return rslt; - } - else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - EIGEN_UNROLL_LOOP - for (int i = 1; i < packetSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 
1 : NumDims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = m_device.lastLevelCacheSize(); - return internal::TensorBlockResourceRequirements::merge( - internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size), - m_impl.getResourceRequirements()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); - TensorBlock block = m_impl.block(arg_desc, scratch); - if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); - return block; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { - typename Storage::Type result = constCast(m_impl.data()); - if (result) { - Index offset = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - if (m_dimensions[i] != m_impl.dimensions()[i]) { - offset += m_offsets[i] * m_inputStrides[i]; - for (int j = i+1; j < NumDims; ++j) { - if (m_dimensions[j] > 1) { - return NULL; - } - offset += m_offsets[j] * m_inputStrides[j]; - } - break; - } - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (m_dimensions[i] != m_impl.dimensions()[i]) { - offset += m_offsets[i] * m_inputStrides[i]; - for (int j = i-1; j >= 0; --j) { - if (m_dimensions[j] > 1) { - return NULL; - } - offset += m_offsets[j] * m_inputStrides[j]; - } - break; - } - } - } - return result + offset; - } - return NULL; - } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex = 0; - if (static_cast<int>(Layout) == 
static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[NumDims-1]); - } - return inputIndex; - } - - array<Index, NumDims> m_outputStrides; - array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; - array<Index, NumDims> m_inputStrides; - TensorEvaluator<ArgType, Device> m_impl; - const Device EIGEN_DEVICE_REF m_device; - Dimensions m_dimensions; - bool m_is_identity; - const StartIndices m_offsets; -}; - - -// Eval as lvalue -template<typename StartIndices, typename Sizes, typename ArgType, typename Device> -struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> - : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> -{ - typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base; - typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; - static const int NumDims = internal::array_size<Sizes>::value; - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef Sizes Dimensions; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, 
Device>::RawAccess - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - if (this->m_is_identity) { - return this->m_impl.coeffRef(index); - } else { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - } - - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - if (this->m_is_identity) { - this->m_impl.template writePacket<StoreMode>(index, x); - return; - } - - const int packetSize = PacketType<CoeffReturnType, Device>::size; - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; - const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; - inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + this->m_offsets[0]); - inputIndices[1] += (indices[1] + this->m_offsets[0]); - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; - const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + 
this->m_offsets[i]) * this->m_inputStrides[i]; - inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); - inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); - } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - this->m_impl.template writePacket<StoreMode>(inputIndices[0], x); - } - else { - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - this->m_impl.coeffRef(inputIndices[0]) = values[0]; - this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; - EIGEN_UNROLL_LOOP - for (int i = 1; i < packetSize-1; ++i) { - this->coeffRef(index+i) = values[i]; - } - } - } - - template<typename TensorBlock> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); - this->m_impl.writeBlock(arg_desc, block); - } -}; - -namespace internal { -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> -struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = array_size<StartIndices>::value; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> -struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 
Eigen::Dense> -{ - typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type; -}; - -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> -struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type> -{ - typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type; -}; - -} // end namespace internal - - -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> -class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > -{ - public: - typedef TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > Base; - typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename internal::nested<TensorStridingSlicingOp>::type Nested; - typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind; - typedef typename internal::traits<TensorStridingSlicingOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp( - const XprType& expr, const StartIndices& startIndices, - const StopIndices& stopIndices, const Strides& strides) - : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), - m_strides(strides) {} - - EIGEN_DEVICE_FUNC - const StartIndices& startIndices() const { return m_startIndices; } - EIGEN_DEVICE_FUNC - const StartIndices& stopIndices() const { return m_stopIndices; } - EIGEN_DEVICE_FUNC - const StartIndices& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp) - - protected: - typename 
XprType::Nested m_xpr; - const StartIndices m_startIndices; - const StopIndices m_stopIndices; - const Strides m_strides; -}; - -// Eval as rvalue -template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device> -struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> -{ - typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; - static const int NumDims = internal::array_size<Strides>::value; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef Strides Dimensions; - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets and sizes. 
- IsAligned = false, - PacketAccess = false, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), - m_device(device), - m_strides(op.strides()) - { - // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero - DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped; - for (ptrdiff_t i = 0; i < internal::array_size<Dimensions>::value; ++i) { - eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); - if (m_strides[i] > 0) { - startIndicesClamped[i] = - clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); - stopIndicesClamped[i] = - clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - } else { - /* implies m_strides[i] < 0 by assert */ - startIndicesClamped[i] = - clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); - stopIndicesClamped[i] = - clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); - } - m_startIndices[i] = startIndicesClamped[i]; - } - - typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; - const InputDimensions& input_dims = m_impl.dimensions(); - - // compute output tensor shape - m_is_identity = true; - for (int i = 0; i < NumDims; i++) { - Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; - if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { - m_dimensions[i] = 0; - } else { - m_dimensions[i] = - (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 
1 : 0); - eigen_assert(m_dimensions[i] >= 0); - } - if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) { - m_is_identity = false; - } - } - - Strides output_dims = m_dimensions; - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_inputStrides[0] = m_strides[0]; - m_offsets[0] = startIndicesClamped[0]; - Index previousDimProduct = 1; - for (int i = 1; i < NumDims; ++i) { - previousDimProduct *= input_dims[i-1]; - m_inputStrides[i] = previousDimProduct * m_strides[i]; - m_offsets[i] = startIndicesClamped[i] * previousDimProduct; - } - - // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); - } - } else { - m_inputStrides[NumDims-1] = m_strides[NumDims-1]; - m_offsets[NumDims-1] = startIndicesClamped[NumDims-1]; - Index previousDimProduct = 1; - for (int i = NumDims - 2; i >= 0; --i) { - previousDimProduct *= input_dims[i+1]; - m_inputStrides[i] = previousDimProduct * m_strides[i]; - m_offsets[i] = startIndicesClamped[i] * previousDimProduct; - } - - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - if (m_is_identity) { - return m_impl.coeff(index); - } else { - return m_impl.coeff(srcCoeff(index)); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { - return NULL; - } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i >= 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += idx * m_inputStrides[i] + m_offsets[i]; - index -= idx * m_outputStrides[i]; - } - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims; ++i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += idx * m_inputStrides[i] + m_offsets[i]; - index -= idx * m_outputStrides[i]; - } - } - return inputIndex; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { -#ifndef SYCL_DEVICE_ONLY - return numext::maxi(min, numext::mini(max,value)); -#else - return cl::sycl::clamp(value, min, max); -#endif - } - - array<Index, NumDims> m_outputStrides; - array<internal::TensorIntDivisor<Index>, 
NumDims> m_fastOutputStrides; - array<Index, NumDims> m_inputStrides; - bool m_is_identity; - TensorEvaluator<ArgType, Device> m_impl; - const Device EIGEN_DEVICE_REF m_device; - DSizes<Index, NumDims> m_startIndices; // clamped startIndices - DSizes<Index, NumDims> m_dimensions; - DSizes<Index, NumDims> m_offsets; // offset in a flattened shape - const Strides m_strides; -}; - -// Eval as lvalue -template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device> -struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> - : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> -{ - typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base; - typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; - static const int NumDims = internal::array_size<Strides>::value; - - enum { - IsAligned = false, - PacketAccess = false, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef Strides Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - if (this->m_is_identity) { - return 
this->m_impl.coeffRef(index); - } else { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h deleted file mode 100644 index ee44382..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h +++ /dev/null @@ -1,708 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H -#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H - -namespace Eigen { - -/** \class TensorPadding - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor padding class. - * At the moment only padding with a constant value is supported. 
- * - */ -namespace internal { -template<typename PaddingDimensions, typename XprType> -struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename PaddingDimensions, typename XprType> -struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense> -{ - typedef const TensorPaddingOp<PaddingDimensions, XprType>& type; -}; - -template<typename PaddingDimensions, typename XprType> -struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type> -{ - typedef TensorPaddingOp<PaddingDimensions, XprType> type; -}; - -} // end namespace internal - - - -template<typename PaddingDimensions, typename XprType> -class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested; - typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value) - : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC - const 
PaddingDimensions& padding() const { return m_padding_dims; } - EIGEN_DEVICE_FUNC - Scalar padding_value() const { return m_padding_value; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const PaddingDimensions m_padding_dims; - const Scalar m_padding_value; -}; - - -// Eval as rvalue -template<typename PaddingDimensions, typename ArgType, typename Device> -struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device> -{ - typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<PaddingDimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = true, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = true, - RawAccess = false - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE 
TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) - { - // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead - // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector - // of 1 element first and then pad. - EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Compute dimensions - m_dimensions = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] += m_padding[i].first + m_padding[i].second; - } - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; - } else { - m_inputStrides[NumDims - 1] = 1; - m_outputStrides[NumDims] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; - } - m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - 
m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(index < dimensions().TotalSize()); - Index inputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (isPaddingAtIndexForDim(idx, i)) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (isPaddingAtIndexForDim(index, 0)) { - return m_paddingValue; - } - inputIndex += (index - m_padding[0].first); - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i+1]; - if (isPaddingAtIndexForDim(idx, i)) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i+1]; - } - if (isPaddingAtIndexForDim(index, NumDims-1)) { - return m_paddingValue; - } - inputIndex += (index - m_padding[NumDims-1].first); - } - return m_impl.coeff(inputIndex); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - return packetColMajor(index); - } - return packetRowMajor(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - TensorOpCost cost = m_impl.costPerCoeff(vectorized); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims; ++i) - updateCostPerDimension(cost, i, i == 0); - } else { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i >= 0; --i) - updateCostPerDimension(cost, i, i == NumDims - 1); - } - return cost; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = 
m_device.lastLevelCacheSize(); - return internal::TensorBlockResourceRequirements::merge( - internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size), - m_impl.getResourceRequirements()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - // If one of the dimensions is zero, return empty block view. - if (desc.size() == 0) { - return TensorBlock(internal::TensorBlockKind::kView, NULL, - desc.dimensions()); - } - - static const bool IsColMajor = Layout == static_cast<int>(ColMajor); - const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1; - - Index offset = desc.offset(); - - // Compute offsets in the output tensor corresponding to the desc.offset(). - DSizes<Index, NumDims> output_offsets; - for (int i = NumDims - 1; i > 0; --i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - const int stride_dim = IsColMajor ? dim : dim + 1; - output_offsets[dim] = offset / m_outputStrides[stride_dim]; - offset -= output_offsets[dim] * m_outputStrides[stride_dim]; - } - output_offsets[inner_dim_idx] = offset; - - // Offsets in the input corresponding to output offsets. - DSizes<Index, NumDims> input_offsets = output_offsets; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - input_offsets[dim] = input_offsets[dim] - m_padding[dim].first; - } - - // Compute offset in the input buffer (at this point it might be illegal and - // point outside of the input buffer, because we don't check for negative - // offsets, it will be autocorrected in the block iteration loop below). - Index input_offset = 0; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? 
i : NumDims - i - 1; - input_offset += input_offsets[dim] * m_inputStrides[dim]; - } - - // Destination buffer and scratch buffer both indexed from 0 and have the - // same dimensions as the requested block (for destination buffer this - // property is guaranteed by `desc.destination()`). - Index output_offset = 0; - const DSizes<Index, NumDims> output_strides = - internal::strides<Layout>(desc.dimensions()); - - // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1` - // dimensions, skipping innermost dimension. In theory it should be possible - // to squeeze matching innermost dimensions, however in practice that did - // not show any improvements in benchmarks. Also in practice first outer - // dimension usually has padding, and will prevent squeezing. - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array<BlockIteratorState, NumDims - 1> it; - for (int i = 0; i < NumDims - 1; ++i) { - const int dim = IsColMajor ? i + 1 : NumDims - i - 2; - it[i].count = 0; - it[i].size = desc.dimension(dim); - - it[i].input_stride = m_inputStrides[dim]; - it[i].input_span = it[i].input_stride * (it[i].size - 1); - - it[i].output_stride = output_strides[dim]; - it[i].output_span = it[i].output_stride * (it[i].size - 1); - } - - const Index input_inner_dim_size = - static_cast<Index>(m_impl.dimensions()[inner_dim_idx]); - - // Total output size. - const Index output_size = desc.size(); - - // We will fill inner dimension of this size in the output. It might be - // larger than the inner dimension in the input, so we might have to pad - // before/after we copy values from the input inner dimension. - const Index output_inner_dim_size = desc.dimension(inner_dim_idx); - - // How many values to fill with padding BEFORE reading from the input inner - // dimension. - const Index output_inner_pad_before_size = - input_offsets[inner_dim_idx] < 0 - ? 
numext::mini(numext::abs(input_offsets[inner_dim_idx]), - output_inner_dim_size) - : 0; - - // How many values we can actually copy from the input inner dimension. - const Index output_inner_copy_size = numext::mini( - // Want to copy from input. - (output_inner_dim_size - output_inner_pad_before_size), - // Can copy from input. - numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + - output_inner_pad_before_size), - Index(0))); - - eigen_assert(output_inner_copy_size >= 0); - - // How many values to fill with padding AFTER reading from the input inner - // dimension. - const Index output_inner_pad_after_size = - (output_inner_dim_size - output_inner_copy_size - - output_inner_pad_before_size); - - // Sanity check, sum of all sizes must be equal to the output size. - eigen_assert(output_inner_dim_size == - (output_inner_pad_before_size + output_inner_copy_size + - output_inner_pad_after_size)); - - // Keep track of current coordinates and padding in the output. - DSizes<Index, NumDims> output_coord = output_offsets; - DSizes<Index, NumDims> output_padded; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); - } - - typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy; - - // Prepare storage for the materialized padding result. - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - - // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a - // single logical inner dimension. - - // When possible we squeeze writes for the innermost (only if non-padded) - // dimension with the first padded dimension. This allows to reduce the - // number of calls to LinCopy and better utilize vector instructions. 
- const bool squeeze_writes = - NumDims > 1 && - // inner dimension is not padded - (input_inner_dim_size == m_dimensions[inner_dim_idx]) && - // and equal to the block inner dimension - (input_inner_dim_size == output_inner_dim_size); - - const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1; - - // Maximum coordinate on a squeeze dimension that we can write to. - const Index squeeze_max_coord = - squeeze_writes ? numext::mini( - // max non-padded element in the input - static_cast<Index>(m_dimensions[squeeze_dim] - - m_padding[squeeze_dim].second), - // max element in the output buffer - static_cast<Index>(output_offsets[squeeze_dim] + - desc.dimension(squeeze_dim))) - : static_cast<Index>(0); - - // Iterate copying data from `m_impl.data()` to the output buffer. - for (Index size = 0; size < output_size;) { - // Detect if we are in the padded region (exclude innermost dimension). - bool is_padded = false; - for (int j = 1; j < NumDims; ++j) { - const int dim = IsColMajor ? j : NumDims - j - 1; - is_padded = output_padded[dim]; - if (is_padded) break; - } - - if (is_padded) { - // Fill single innermost dimension with padding value. - size += output_inner_dim_size; - - LinCopy::template Run<LinCopy::Kind::FillLinear>( - typename LinCopy::Dst(output_offset, 1, block_storage.data()), - typename LinCopy::Src(0, 0, &m_paddingValue), - output_inner_dim_size); - - - } else if (squeeze_writes) { - // Squeeze multiple reads from innermost dimensions. - const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim]; - size += output_inner_dim_size * squeeze_num; - - // Copy `squeeze_num` inner dimensions from input to output. 
- LinCopy::template Run<LinCopy::Kind::Linear>( - typename LinCopy::Dst(output_offset, 1, block_storage.data()), - typename LinCopy::Src(input_offset, 1, m_impl.data()), - output_inner_dim_size * squeeze_num); - - // Update iteration state for only `squeeze_num - 1` processed inner - // dimensions, because we have another iteration state update at the end - // of the loop that will update iteration state for the last inner - // processed dimension. - it[0].count += (squeeze_num - 1); - input_offset += it[0].input_stride * (squeeze_num - 1); - output_offset += it[0].output_stride * (squeeze_num - 1); - output_coord[squeeze_dim] += (squeeze_num - 1); - - } else { - // Single read from innermost dimension. - size += output_inner_dim_size; - - { // Fill with padding before copying from input inner dimension. - const Index out = output_offset; - - LinCopy::template Run<LinCopy::Kind::FillLinear>( - typename LinCopy::Dst(out, 1, block_storage.data()), - typename LinCopy::Src(0, 0, &m_paddingValue), - output_inner_pad_before_size); - } - - { // Copy data from input inner dimension. - const Index out = output_offset + output_inner_pad_before_size; - const Index in = input_offset + output_inner_pad_before_size; - - eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL); - - LinCopy::template Run<LinCopy::Kind::Linear>( - typename LinCopy::Dst(out, 1, block_storage.data()), - typename LinCopy::Src(in, 1, m_impl.data()), - output_inner_copy_size); - } - - { // Fill with padding after copying from input inner dimension. - const Index out = output_offset + output_inner_pad_before_size + - output_inner_copy_size; - - LinCopy::template Run<LinCopy::Kind::FillLinear>( - typename LinCopy::Dst(out, 1, block_storage.data()), - typename LinCopy::Src(0, 0, &m_paddingValue), - output_inner_pad_after_size); - } - } - - for (int j = 0; j < NumDims - 1; ++j) { - const int dim = IsColMajor ? 
j + 1 : NumDims - j - 2; - - if (++it[j].count < it[j].size) { - input_offset += it[j].input_stride; - output_offset += it[j].output_stride; - output_coord[dim] += 1; - output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); - break; - } - it[j].count = 0; - input_offset -= it[j].input_span; - output_offset -= it[j].output_span; - output_coord[dim] -= it[j].size - 1; - output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); - } - } - - return block_storage.AsTensorMaterializedBlock(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - private: - struct BlockIteratorState { - BlockIteratorState() - : count(0), - size(0), - input_stride(0), - input_span(0), - output_stride(0), - output_span(0) {} - - Index count; - Index size; - Index input_stride; - Index input_span; - Index output_stride; - Index output_span; - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( - Index index, int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) && - index < m_padding[dim_index].first) || - (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) && - index >= m_dimensions[dim_index] - m_padding[dim_index].second); -#else - return (index < m_padding[dim_index].first) || - (index >= m_dimensions[dim_index] - m_padding[dim_index].second); -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( - int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0); -#else - EIGEN_UNUSED_VARIABLE(dim_index); - return false; -#endif - } - - EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( - int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0); -#else - EIGEN_UNUSED_VARIABLE(dim_index); - return false; -#endif - } - - - void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { - const double in = static_cast<double>(m_impl.dimensions()[i]); - const double out = in + m_padding[i].first + m_padding[i].second; - if (out == 0) - return; - const double reduction = in / out; - cost *= reduction; - if (first) { - cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + - reduction * (1 * TensorOpCost::AddCost<Index>())); - } else { - cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + - 2 * TensorOpCost::MulCost<Index>() + - reduction * (2 * TensorOpCost::MulCost<Index>() + - 1 * TensorOpCost::DivCost<Index>())); - } - } - - protected: - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index initialIndex = index; - Index inputIndex = 0; - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index firstIdx = index; - const Index lastIdx = index + PacketSize - 1; - const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; - const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; - const Index lastPaddedRight = m_outputStrides[i+1]; - - if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { - // all the coefficient are in the padding zone. 
- return internal::pset1<PacketReturnType>(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. - const Index idx = index / m_outputStrides[i]; - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - else { - // Every other case - return packetWithPossibleZero(initialIndex); - } - } - - const Index lastIdx = index + PacketSize - 1; - const Index firstIdx = index; - const Index lastPaddedLeft = m_padding[0].first; - const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); - const Index lastPaddedRight = m_outputStrides[1]; - - if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. 
- inputIndex += (index - m_padding[0].first); - return m_impl.template packet<Unaligned>(inputIndex); - } - // Every other case - return packetWithPossibleZero(initialIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index initialIndex = index; - Index inputIndex = 0; - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index firstIdx = index; - const Index lastIdx = index + PacketSize - 1; - const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; - const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; - const Index lastPaddedRight = m_outputStrides[i]; - - if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. 
- const Index idx = index / m_outputStrides[i+1]; - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i+1]; - } - else { - // Every other case - return packetWithPossibleZero(initialIndex); - } - } - - const Index lastIdx = index + PacketSize - 1; - const Index firstIdx = index; - const Index lastPaddedLeft = m_padding[NumDims-1].first; - const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); - const Index lastPaddedRight = m_outputStrides[NumDims-1]; - - if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1<PacketReturnType>(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. 
- inputIndex += (index - m_padding[NumDims-1].first); - return m_impl.template packet<Unaligned>(inputIndex); - } - // Every other case - return packetWithPossibleZero(initialIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const - { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - - Dimensions m_dimensions; - array<Index, NumDims+1> m_outputStrides; - array<Index, NumDims> m_inputStrides; - TensorEvaluator<ArgType, Device> m_impl; - PaddingDimensions m_padding; - - Scalar m_paddingValue; - - const Device EIGEN_DEVICE_REF m_device; -}; - - - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h deleted file mode 100644 index 413d25d..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h +++ /dev/null @@ -1,291 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H -#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H - -namespace Eigen { - -/** \class TensorPatch - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor patch class. 
- * - * - */ -namespace internal { -template<typename PatchDim, typename XprType> -struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename PatchDim, typename XprType> -struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense> -{ - typedef const TensorPatchOp<PatchDim, XprType>& type; -}; - -template<typename PatchDim, typename XprType> -struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type> -{ - typedef TensorPatchOp<PatchDim, XprType> type; -}; - -} // end namespace internal - - - -template<typename PatchDim, typename XprType> -class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested; - typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) - : m_xpr(expr), m_patch_dims(patch_dims) {} - - EIGEN_DEVICE_FUNC - const PatchDim& patch_dims() const { return m_patch_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - 
typename XprType::Nested m_xpr; - const PatchDim m_patch_dims; -}; - - -// Eval as rvalue -template<typename PatchDim, typename ArgType, typename Device> -struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> -{ - typedef TensorPatchOp<PatchDim, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - Index num_patches = 1; - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - const PatchDim& patch_dims = op.patch_dims(); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < NumDims-1; ++i) { - m_dimensions[i] = patch_dims[i]; - num_patches *= (input_dims[i] - patch_dims[i] + 1); - } - m_dimensions[NumDims-1] = num_patches; - - m_inputStrides[0] = 1; - m_patchStrides[0] = 1; - for (int i = 1; i < NumDims-1; ++i) { - 
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); - } - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - } else { - for (int i = 0; i < NumDims-1; ++i) { - m_dimensions[i+1] = patch_dims[i]; - num_patches *= (input_dims[i] - patch_dims[i] + 1); - } - m_dimensions[0] = num_patches; - - m_inputStrides[NumDims-2] = 1; - m_patchStrides[NumDims-2] = 1; - for (int i = NumDims-3; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1); - } - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims-2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0; - // Find the location of the first element of the patch. - Index patchIndex = index / m_outputStrides[output_stride_index]; - // Find the offset of the element wrt the location of the first element. 
- Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; - Index inputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = patchOffset / m_outputStrides[i]; - patchOffset -= offsetIdx * m_outputStrides[i]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = patchOffset / m_outputStrides[i+1]; - patchOffset -= offsetIdx * m_outputStrides[i+1]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } - inputIndex += (patchIndex + patchOffset); - return m_impl.coeff(inputIndex); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
NumDims - 1 : 0; - Index indices[2] = {index, index + PacketSize - 1}; - Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], - indices[1] / m_outputStrides[output_stride_index]}; - Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], - indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; - - Index inputIndices[2] = {0, 0}; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], - patchIndices[1] / m_patchStrides[i]}; - patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; - patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; - - const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], - patchOffsets[1] / m_outputStrides[i]}; - patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; - patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; - - inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; - inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; - } - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], - patchIndices[1] / m_patchStrides[i]}; - patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; - patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; - - const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1], - patchOffsets[1] / m_outputStrides[i+1]}; - patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1]; - patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1]; - - inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; - inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; - } - } - inputIndices[0] += (patchIndices[0] + patchOffsets[0]); - inputIndices[1] += (patchIndices[1] + patchOffsets[1]); - - if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { - PacketReturnType rslt = 
m_impl.template packet<Unaligned>(inputIndices[0]); - return rslt; - } - else { - EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[PacketSize-1] = m_impl.coeff(inputIndices[1]); - EIGEN_UNROLL_LOOP - for (int i = 1; i < PacketSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() + - TensorOpCost::MulCost<Index>() + - 2 * TensorOpCost::AddCost<Index>()); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: - Dimensions m_dimensions; - array<Index, NumDims> m_outputStrides; - array<Index, NumDims-1> m_inputStrides; - array<Index, NumDims-1> m_patchStrides; - - TensorEvaluator<ArgType, Device> m_impl; - -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h deleted file mode 100644 index 37c1d1c..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h +++ /dev/null @@ -1,322 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H -#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H - -namespace Eigen { -namespace internal { - -namespace { - -EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#if defined(EIGEN_GPU_COMPILE_PHASE) - // We don't support 3d kernels since we currently only use 1 and - // 2d kernels. - gpu_assert(threadIdx.z == 0); - return blockIdx.x * blockDim.x + threadIdx.x - + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); -#else - // Rely on Eigen's random implementation. - return random<uint64_t>(); -#endif -} - -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { - // TODO: Unify with the implementation in the non blocking thread pool. - uint64_t current = *state; - // Update the internal state - *state = current * 6364136223846793005ULL + (stream << 1 | 1); - // Generate the random output (using the PCG-XSH-RS scheme) - return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); -} - -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { - seed = seed ? seed : get_random_seed(); - return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; -} - -} // namespace - - -template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state, uint64_t stream) { - unsigned rnd = PCG_XSH_RS_generator(state, stream); - return static_cast<T>(rnd); -} - - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) { - // Generate 10 random bits for the mantissa, merge with exponent. 
- unsigned rnd = PCG_XSH_RS_generator(state, stream); - const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10); - Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits); - // Return the final result - return result - Eigen::half(1.0f); -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) { - - // Generate 7 random bits for the mantissa, merge with exponent. - unsigned rnd = PCG_XSH_RS_generator(state, stream); - const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7); - Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits); - // Return the final result - return result - Eigen::bfloat16(1.0f); -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) { - typedef union { - uint32_t raw; - float fp; - } internal; - internal result; - // Generate 23 random bits for the mantissa mantissa - const unsigned rnd = PCG_XSH_RS_generator(state, stream); - result.raw = rnd & 0x7fffffu; - // Set the exponent - result.raw |= (static_cast<uint32_t>(127) << 23); - // Return the final result - return result.fp - 1.0f; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) { - typedef union { - uint64_t raw; - double dp; - } internal; - internal result; - result.raw = 0; - // Generate 52 random bits for the mantissa - // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; - // The generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state, stream); - result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2; - // Set the exponent - result.raw |= (static_cast<uint64_t>(1023) << 52); - // Return the final result - return result.dp - 1.0; -} - -template <> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE -std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) { - return std::complex<float>(RandomToTypeUniform<float>(state, stream), - RandomToTypeUniform<float>(state, stream)); -} -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) { - return std::complex<double>(RandomToTypeUniform<double>(state, stream), - RandomToTypeUniform<double>(state, stream)); -} - -template <typename T> class UniformRandomGenerator { - public: - static const bool PacketAccess = true; - - // Uses the given "seed" if non-zero, otherwise uses a random seed. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( - uint64_t seed = 0) { - m_state = PCG_XSH_RS_state(seed); - #ifdef EIGEN_USE_SYCL - // In SYCL it is not possible to build PCG_XSH_RS_state in one step. - // Therefor, we need two step to initializate the m_state. - // IN SYCL, the constructor of the functor is s called on the CPU - // and we get the clock seed here from the CPU. However, This seed is - //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. - // and only available on the Operator() function (which is called on the GPU). - // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread - // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds - // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction - // similar to CUDA Therefore, the thread Id injection is not available at this stage. - //However when the operator() is called the thread ID will be avilable. So inside the opeator, - // we add the thrreadID, BlockId,... (which is equivalent of i) - //to the seed and construct the unique m_state per thead similar to cuda. 
- m_exec_once =false; - #endif - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( - const UniformRandomGenerator& other) { - m_state = other.m_state; - #ifdef EIGEN_USE_SYCL - m_exec_once =other.m_exec_once; - #endif - } - - template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T operator()(Index i) const { - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - T result = RandomToTypeUniform<T>(&m_state, i); - return result; - } - - template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(Index i) const { - const int packetSize = internal::unpacket_traits<Packet>::size; - EIGEN_ALIGN_MAX T values[packetSize]; - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - EIGEN_UNROLL_LOOP - for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform<T>(&m_state, i); - } - return internal::pload<Packet>(values); - } - - private: - mutable uint64_t m_state; - #ifdef EIGEN_USE_SYCL - mutable bool m_exec_once; - #endif -}; - -template <typename Scalar> -struct functor_traits<UniformRandomGenerator<Scalar> > { - enum { - // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). 
- Cost = 12 * NumTraits<Scalar>::AddCost * - ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), - PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess - }; -}; - - - -template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state, uint64_t stream) { - // Use the ratio of uniform method to generate numbers following a normal - // distribution. See for example Numerical Recipes chapter 7.3.9 for the - // details. - T u, v, q; - do { - u = RandomToTypeUniform<T>(state, stream); - v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5)); - const T x = u - T(0.449871); - const T y = numext::abs(v) + T(0.386595); - q = x*x + y * (T(0.196)*y - T(0.25472)*x); - } while (q > T(0.27597) && - (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); - - return v/u; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) { - return std::complex<float>(RandomToTypeNormal<float>(state, stream), - RandomToTypeNormal<float>(state, stream)); -} -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) { - return std::complex<double>(RandomToTypeNormal<double>(state, stream), - RandomToTypeNormal<double>(state, stream)); -} - - -template <typename T> class NormalRandomGenerator { - public: - static const bool PacketAccess = true; - - // Uses the given "seed" if non-zero, otherwise uses a random seed. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { - m_state = PCG_XSH_RS_state(seed); - #ifdef EIGEN_USE_SYCL - // In SYCL it is not possible to build PCG_XSH_RS_state in one step. - // Therefor, we need two steps to initializate the m_state. - // IN SYCL, the constructor of the functor is s called on the CPU - // and we get the clock seed here from the CPU. However, This seed is - //the same for all the thread. 
As unlike CUDA, the thread.ID, BlockID, etc is not a global function. - // and only available on the Operator() function (which is called on the GPU). - // Therefore, the thread Id injection is not available at this stage. However when the operator() - //is called the thread ID will be avilable. So inside the opeator, - // we add the thrreadID, BlockId,... (which is equivalent of i) - //to the seed and construct the unique m_state per thead similar to cuda. - m_exec_once =false; - #endif - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( - const NormalRandomGenerator& other) { - m_state = other.m_state; -#ifdef EIGEN_USE_SYCL - m_exec_once=other.m_exec_once; -#endif - } - - template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T operator()(Index i) const { - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - T result = RandomToTypeNormal<T>(&m_state, i); - return result; - } - - template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(Index i) const { - const int packetSize = internal::unpacket_traits<Packet>::size; - EIGEN_ALIGN_MAX T values[packetSize]; - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - EIGEN_UNROLL_LOOP - for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal<T>(&m_state, i); - } - return internal::pload<Packet>(values); - } - - private: - mutable uint64_t m_state; - #ifdef EIGEN_USE_SYCL - mutable bool m_exec_once; - #endif -}; - - -template <typename Scalar> -struct functor_traits<NormalRandomGenerator<Scalar> > { - enum { - // On average, we need to generate about 3 random numbers - // 15 mul, 8 add, 1.5 logs - 
Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + - 15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost + - 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2, - PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess - }; -}; - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h deleted file mode 100644 index 583f462..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h +++ /dev/null @@ -1,998 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H - -// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, -// so we'll use a macro to make clang happy. -#ifndef KERNEL_FRIEND -#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) -#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 -#else -#define KERNEL_FRIEND friend -#endif -#endif - - -namespace Eigen { - - -/** \class TensorReduction - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reduction class. 
- * - */ - -namespace internal { - template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ > - struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> > - : traits<XprType> -{ - typedef traits<XprType> XprTraits; - typedef typename XprTraits::Scalar Scalar; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; - - template <class T> struct MakePointer { - // Intermediate typedef to workaround MSVC issue. - typedef MakePointer_<T> MakePointerT; - typedef typename MakePointerT::Type Type; - }; -}; - -template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_> -struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> -{ - typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type; -}; - -template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_> -struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type> -{ - typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type; -}; - - -template <typename OutputDims> struct DimInitializer { - template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC - static void run(const InputDims& input_dims, - const array<bool, internal::array_size<InputDims>::value>& reduced, - OutputDims* output_dims, ReducedDims* reduced_dims) { - const int NumInputDims = internal::array_size<InputDims>::value; - int outputIndex = 0; - int reduceIndex = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { - (*reduced_dims)[reduceIndex] = input_dims[i]; - ++reduceIndex; - } else { - (*output_dims)[outputIndex] = input_dims[i]; - 
++outputIndex; - } - } - } -}; - -template <> struct DimInitializer<Sizes<> > { - template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC - static void run(const InputDims& input_dims, const array<bool, Rank>&, - Sizes<>*, array<Index, Rank>* reduced_dims) { - const int NumInputDims = internal::array_size<InputDims>::value; - for (int i = 0; i < NumInputDims; ++i) { - (*reduced_dims)[i] = input_dims[i]; - } - } -}; - - -template <typename ReducedDims, int NumTensorDims, int Layout> -struct are_inner_most_dims { - static const bool value = false; -}; -template <typename ReducedDims, int NumTensorDims, int Layout> -struct preserve_inner_most_dims { - static const bool value = false; -}; - -#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES -template <typename ReducedDims, int NumTensorDims> -struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ - static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>(); - static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0); - static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1); - static const bool value = tmp1 & tmp2 & tmp3; -}; -template <typename ReducedDims, int NumTensorDims> -struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ - static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>(); - static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value); - static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1); - static const bool value = tmp1 & tmp2 & tmp3; - -}; -template <typename ReducedDims, int NumTensorDims> -struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ - static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>(); - static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0); - static const bool value 
= tmp1 & tmp2; - -}; -template <typename ReducedDims, int NumTensorDims> -struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ - static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>(); - static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1); - static const bool value = tmp1 & tmp2; -}; -#endif - - -template <int DimIndex, typename Self, typename Op> -struct GenericDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { - EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; - GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); - } - } -}; -template <typename Self, typename Op> -struct GenericDimReducer<0, Self, Op> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { - for (int j = 0; j < self.m_reducedDims[0]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; - reducer.reduce(self.m_impl.coeff(input), accum); - } - } -}; -template <typename Self, typename Op> -struct GenericDimReducer<-1, Self, Op> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) { - reducer.reduce(self.m_impl.coeff(index), accum); - } -}; - -template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess), - bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && - !Self::ReducerTraits::IsExactlyAssociative)> -struct InnerMostDimReducer { - static EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - typename Self::CoeffReturnType accum = reducer.initialize(); - for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - return reducer.finalize(accum); - } -}; - -template <typename Self, typename Op> -struct InnerMostDimReducer<Self, Op, true, false> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size; - const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>(); - for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum); - } - typename Self::CoeffReturnType accum = reducer.initialize(); - for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - return reducer.finalizeBoth(accum, paccum); - } -}; - -#if !defined(EIGEN_HIPCC) -static const int kLeafSize = 1024; - -template <typename Self, typename Op> -struct InnerMostDimReducer<Self, Op, false, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType - reduce(const Self& self, typename Self::Index firstIndex, - typename Self::Index numValuesToReduce, Op& reducer) { - typename Self::CoeffReturnType accum = reducer.initialize(); - if (numValuesToReduce > kLeafSize) { - const typename Self::Index half = numValuesToReduce / 2; - reducer.reduce(reduce(self, firstIndex, half, reducer), 
&accum); - reducer.reduce( - reduce(self, firstIndex + half, numValuesToReduce - half, reducer), - &accum); - } else { - for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - } - return reducer.finalize(accum); - } -}; - -template <typename Self, typename Op> -struct InnerMostDimReducer<Self, Op, true, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType - reduce(const Self& self, typename Self::Index firstIndex, - typename Self::Index numValuesToReduce, Op& reducer) { - const typename Self::Index packetSize = - internal::unpacket_traits<typename Self::PacketReturnType>::size; - typename Self::CoeffReturnType accum = reducer.initialize(); - if (numValuesToReduce > packetSize * kLeafSize) { - // Make sure the split point is aligned on a packet boundary. - const typename Self::Index split = - packetSize * - divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)), - packetSize); - const typename Self::Index num_left = - numext::mini(split - firstIndex, numValuesToReduce); - reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); - if (num_left < numValuesToReduce) { - reducer.reduce( - reduce(self, split, numValuesToReduce - num_left, reducer), &accum); - } - return reducer.finalize(accum); - } else { - const typename Self::Index UnrollSize = - (numValuesToReduce / (2*packetSize)) * 2*packetSize; - const typename Self::Index VectorizedSize = - (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType paccum = - reducer.template initializePacket<typename Self::PacketReturnType>(); - typename Self::PacketReturnType paccum2 = - reducer.template initializePacket<typename Self::PacketReturnType>(); - for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) { - reducer.reducePacket( - self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum); - reducer.reducePacket( - self.m_impl.template 
packet<Unaligned>(firstIndex + j + packetSize), - &paccum2); - } - for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) { - reducer.reducePacket(self.m_impl.template packet<Unaligned>( - firstIndex + j), &paccum); - } - reducer.reducePacket(paccum2, &paccum); - for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; - ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - return reducer.finalizeBoth(accum, paccum); - } - } -}; -#endif - -template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)> -struct InnerMostDimPreserver { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { - eigen_assert(false && "should never be called"); - } -}; - -template <int DimIndex, typename Self, typename Op> -struct InnerMostDimPreserver<DimIndex, Self, Op, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; - InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); - } - } -}; - -template <typename Self, typename Op> -struct InnerMostDimPreserver<0, Self, Op, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; - reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum); - } - } -}; -template <typename Self, typename 
Op> -struct InnerMostDimPreserver<-1, Self, Op, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { - eigen_assert(false && "should never be called"); - } -}; - -// Default full reducer -template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)> -struct FullReducer { - static const bool HasOptimizedImplementation = false; - - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) { - const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); - *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); - } -}; - - -#ifdef EIGEN_USE_THREADS -// Multithreaded full reducers -template <typename Self, typename Op, - bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)> -struct FullReducerShard { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, - typename Self::Index numValuesToReduce, Op& reducer, - typename Self::CoeffReturnType* output) { - *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce( - self, firstIndex, numValuesToReduce, reducer); - } -}; - -// Multithreaded full reducer -template <typename Self, typename Op, bool Vectorizable> -struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { - static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; - static const Index PacketSize = - unpacket_traits<typename Self::PacketReturnType>::size; - - // launch one reducer per thread and accumulate the result. 
- static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, - typename Self::CoeffReturnType* output) { - typedef typename Self::Index Index; - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - if (num_coeffs == 0) { - *output = reducer.finalize(reducer.initialize()); - return; - } - const TensorOpCost cost = - self.m_impl.costPerCoeff(Vectorizable) + - TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, - PacketSize); - const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( - num_coeffs, cost, device.numThreads()); - if (num_threads == 1) { - *output = - InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); - return; - } - const Index blocksize = - std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); - const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; - eigen_assert(num_coeffs >= numblocks * blocksize); - - Barrier barrier(internal::convert_index<unsigned int>(numblocks)); - MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); - for (Index i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, - self, i * blocksize, blocksize, reducer, - &shards[i]); - } - typename Self::CoeffReturnType finalShard; - if (numblocks * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, - reducer); - } else { - finalShard = reducer.initialize(); - } - barrier.Wait(); - - for (Index i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i], &finalShard); - } - *output = reducer.finalize(finalShard); - } -}; - -#endif - - -// Default inner reducer -template <typename Self, typename Op, typename Device> -struct InnerReducer { - static const bool HasOptimizedImplementation = false; - - EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename 
Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - eigen_assert(false && "Not implemented"); - return true; - } -}; - -// Default outer reducer -template <typename Self, typename Op, typename Device> -struct OuterReducer { - static const bool HasOptimizedImplementation = false; - - EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - eigen_assert(false && "Not implemented"); - return true; - } -}; - -#ifdef EIGEN_USE_SYCL -// Default Generic reducer -template <typename Self, typename Op, typename Device> -struct GenericReducer { - static const bool HasOptimizedImplementation = false; - - EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - eigen_assert(false && "Not implemented"); - return true; - } -}; -#endif - -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) -template <int B, int N, typename S, typename R, typename I_> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); - - -#if defined(EIGEN_HAS_GPU_FP16) -template <typename S, typename R, typename I_> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*); -template <int B, int N, typename S, typename R, typename I_> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*); -template <int NPT, typename S, typename R, typename I_> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); - -#endif - -template <int NPT, typename S, typename R, typename I_> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); - -template <int NPT, typename S, typename R, 
typename I_> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); -#endif - -/** - * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. - * This allows the reduction to have a different type for the accumulator than the input data type. - * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input - * with the accumulator and the other for reducing two accumulators. - * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for - * some properties of the input. - */ -template <typename Op, typename CoeffReturnType> -struct ReductionReturnType { -#if defined(EIGEN_USE_SYCL) - typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type; -#else - typedef typename remove_const<CoeffReturnType>::type type; -#endif -}; - -} // end namespace internal - - -template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_> -class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> { - public: - typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested; - typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) - { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) - { } 
- - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const XprType& expression() const { return m_expr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Dims& dims() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Op& reducer() const { return m_reducer; } - - protected: - typename XprType::Nested m_expr; - const Dims m_dims; - const Op m_reducer; -}; - -template<typename ArgType, typename Device> -struct TensorReductionEvaluatorBase; - -// Eval as rvalue -template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device> -struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> -{ - typedef internal::reducer_traits<Op, Device> ReducerTraits; - typedef Dims ReducedDims; - typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType; - typedef typename XprType::Index Index; - typedef ArgType ChildType; - typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; - static const int NumInputDims = internal::array_size<InputDimensions>::value; - static const int NumReducedDims = internal::array_size<Dims>::value; - static const int NumOutputDims = NumInputDims - NumReducedDims; - typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions; - typedef typename XprType::Scalar Scalar; - typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self; - static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess; - typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const Index PacketSize = PacketType<CoeffReturnType, Device>::size; - - typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef 
typename Storage::Type EvaluatorPointerType; - - // Subset of strides of the input tensor for the non-reduced dimensions. - // Indexed by output dimensions. - static const int NumPreservedStrides = max_n_1<NumOutputDims>::size; - - enum { - IsAligned = false, - PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, - BlockAccess = false, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value; - static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value; - static const bool RunningFullReduction = (NumOutputDims==0); - - EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) - { - EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Build the bitmap indicating if an input dimension is reduced or not. 
- for (int i = 0; i < NumInputDims; ++i) { - m_reduced[i] = false; - } - for (int i = 0; i < NumReducedDims; ++i) { - eigen_assert(op.dims()[i] >= 0); - eigen_assert(op.dims()[i] < NumInputDims); - m_reduced[op.dims()[i]] = true; - } - - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); - - // Precompute output strides. - if (NumOutputDims > 0) { - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_outputStrides[0] = 1; - for (int i = 1; i < NumOutputDims; ++i) { - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); - } - } else { - m_outputStrides[NumOutputDims - 1] = 1; - for (int i = NumOutputDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); - } - } - } - - // Precompute input strides. 
- if (NumInputDims > 0) { - array<Index, NumInputDims> input_strides; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - input_strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - input_strides[i] = input_strides[i-1] * input_dims[i-1]; - } - } else { - input_strides.back() = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; - } - } - - int outputIndex = 0; - int reduceIndex = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (m_reduced[i]) { - m_reducedStrides[reduceIndex] = input_strides[i]; - ++reduceIndex; - } else { - m_preservedStrides[outputIndex] = input_strides[i]; - m_output_to_input_dim_map[outputIndex] = i; - ++outputIndex; - } - } - } - - // Special case for full reductions - if (NumOutputDims == 0) { - m_preservedStrides[0] = internal::array_prod(input_dims); - } - - m_numValuesToReduce = - NumOutputDims == 0 - ? internal::array_prod(input_dims) - : (static_cast<int>(Layout) == static_cast<int>(ColMajor)) - ? m_preservedStrides[0] - : m_preservedStrides[NumOutputDims - 1]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE - bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) { - // Use the FullReducer if possible. - if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && - internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && - ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - !RunningOnGPU))) { - bool need_assign = false; - if (!data) { - m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); - data = m_result; - need_assign = true; - } - Op reducer(m_reducer); - internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data); - return need_assign; - } - - // Attempt to use an optimized reduction. 
- else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { - bool reducing_inner_dims = true; - for (int i = 0; i < NumReducedDims; ++i) { - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - reducing_inner_dims &= m_reduced[i]; - } else { - reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; - } - } - if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation && - (reducing_inner_dims || ReducingInnerMostDims)) { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) { - data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); - m_result = data; - } - else { - return true; - } - } - Op reducer(m_reducer); - // For SYCL this if always return false - if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { - if (m_result) { - m_device.deallocate_temp(m_result); - m_result = NULL; - } - return true; - } else { - return (m_result != NULL); - } - } - - bool preserving_inner_dims = true; - for (int i = 0; i < NumReducedDims; ++i) { - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; - } else { - preserving_inner_dims &= m_reduced[i]; - } - } - if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation && - preserving_inner_dims) { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || 
(RunningOnSycl)) { - data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); - m_result = data; - } - else { - return true; - } - } - Op reducer(m_reducer); - // For SYCL this if always return false - if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { - if (m_result) { - m_device.deallocate_temp(m_result); - m_result = NULL; - } - return true; - } else { - return (m_result != NULL); - } - } - #if defined(EIGEN_USE_SYCL) - // If there is no Optimised version for SYCL, the reduction expression - // must break into two subexpression and use the SYCL generic Reducer on the device. - if(RunningOnSycl) { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); - m_result = data; - } - Op reducer(m_reducer); - internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return (m_result != NULL); - } - #endif - } - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE - void - evalSubExprsIfNeededAsync(EvaluatorPointerType data, - EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { - done(evalSubExprsIfNeededCommon(data)); - }); - } -#endif - - EIGEN_STRONG_INLINE - bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - m_impl.evalSubExprsIfNeeded(NULL); - return evalSubExprsIfNeededCommon(data); - } - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - if (m_result) { - m_device.deallocate_temp(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
CoeffReturnType coeff(Index index) const - { - if (( RunningFullReduction || RunningOnGPU) && m_result ) { - return *(m_result + index); - } - Op reducer(m_reducer); - if (ReducingInnerMostDims || RunningFullReduction) { - const Index num_values_to_reduce = - (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; - return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), - num_values_to_reduce, reducer); - } else { - typename Self::CoeffReturnType accum = reducer.initialize(); - internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum); - return reducer.finalize(accum); - } - } - - // TODO(bsteiner): provide a more efficient implementation. - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); - - if (RunningOnGPU && m_result) { - return internal::pload<PacketReturnType>(m_result + index); - } - - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - if (ReducingInnerMostDims) { - const Index num_values_to_reduce = - (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; - const Index firstIndex = firstInput(index); - for (Index i = 0; i < PacketSize; ++i) { - Op reducer(m_reducer); - values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce, - num_values_to_reduce, reducer); - } - } else if (PreservingInnerMostDims) { - const Index firstIndex = firstInput(index); - const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1; - // TBD: extend this the the n innermost dimensions that we preserve. 
- if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { - Op reducer(m_reducer); - typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>(); - internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum); - return reducer.finalizePacket(accum); - } else { - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index + i); - } - } - } else { - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index + i); - } - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - - // Must be called after evalSubExprsIfNeeded(). - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - if (RunningFullReduction && m_result) { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } else { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost; - return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } - EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - m_result.bind(cgh); - } -#endif - - private: - template <int, typename, typename> friend struct internal::GenericDimReducer; - template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer; - template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver; - template <typename S, 
typename O, typename D, bool V> friend struct internal::FullReducer; -#ifdef EIGEN_USE_THREADS - template <typename S, typename O, bool V> friend struct internal::FullReducerShard; -#endif -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) - template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); -#if defined(EIGEN_HAS_GPU_FP16) - template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*); - template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*); - template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); -#endif - template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); - - template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); -#endif - -#if defined(EIGEN_USE_SYCL) - template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer; - // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer - template <typename, typename, typename> friend struct internal::GenericReducer; -#endif - - - template <typename S, typename O, typename D> friend struct internal::InnerReducer; - - struct BlockIteratorState { - Index input_dim; - Index output_size; - Index output_count; - }; - - // Returns the Index in the input tensor of the first value that needs to be - // used to compute the reduction at output index "index". 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - if (ReducingInnerMostDims) { - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - return index * m_preservedStrides[0]; - } else { - return index * m_preservedStrides[NumPreservedStrides - 1]; - } - } - // TBD: optimize the case where we preserve the innermost dimensions. - Index startInput = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumOutputDims - 1; i > 0; --i) { - // This is index_i in the output tensor. - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (PreservingInnerMostDims) { - eigen_assert(m_preservedStrides[0] == 1); - startInput += index; - } else { - startInput += index * m_preservedStrides[0]; - } - } else { - for (int i = 0; i < NumOutputDims - 1; ++i) { - // This is index_i in the output tensor. - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (PreservingInnerMostDims) { - eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); - startInput += index; - } else { - startInput += index * m_preservedStrides[NumPreservedStrides - 1]; - } - } - return startInput; - } - - // Bitmap indicating if an input dimension is reduced or not. - array<bool, NumInputDims> m_reduced; - // Dimensions of the output of the operation. - Dimensions m_dimensions; - // Precomputed strides for the output tensor. - array<Index, NumOutputDims> m_outputStrides; - array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides; - array<Index, NumPreservedStrides> m_preservedStrides; - // Map from output to input dimension index. - array<Index, NumOutputDims> m_output_to_input_dim_map; - // How many values go into each reduction - Index m_numValuesToReduce; - - // Subset of strides of the input tensor for the reduced dimensions. 
- // Indexed by reduced dimensions. - array<Index, NumReducedDims> m_reducedStrides; - // Size of the input dimensions that are reduced. - // Indexed by reduced dimensions. - array<Index, NumReducedDims> m_reducedDims; - - // Evaluator for the input expression. - TensorEvaluator<ArgType, Device> m_impl; - - // Operation to apply for computing the reduction. - Op m_reducer; - - // For full reductions -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) - static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value; - static const bool RunningOnSycl = false; -#elif defined(EIGEN_USE_SYCL) -static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value; -static const bool RunningOnGPU = false; -#else - static const bool RunningOnGPU = false; - static const bool RunningOnSycl = false; -#endif - EvaluatorPointerType m_result; - - const Device EIGEN_DEVICE_REF m_device; -}; - -template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device> -struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> -: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> { - typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base; - EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} -}; - - -template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_> -struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> -: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> { - - typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base; - EIGEN_STRONG_INLINE TensorEvaluator(const typename 
Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} - // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel - //Therefore the coeff function should be overridden by for SYCL kernel - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { - return *(this->data() + index); - } - // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel - //Therefore the packet function should be overridden by for SYCL kernel - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const { - return internal::pload<typename Base::PacketReturnType>(this->data() + index); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h deleted file mode 100644 index 68780cd..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h +++ /dev/null @@ -1,6 +0,0 @@ - -#if defined(__clang__) || defined(__GNUC__) -#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file" -#endif - -#include "TensorReductionGpu.h" diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h deleted file mode 100644 index db4e8d8..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h +++ /dev/null @@ -1,966 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H - -namespace Eigen { -namespace internal { - - -#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) -// Full reducers for GPU, don't vectorize for now - -// Reducer function that enables multiple gpu thread to safely accumulate at the same -// output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another gpu thread -// updated the content of the output address it will try again. -template <typename T, typename R> -__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) - if (sizeof(T) == 4) - { - unsigned int oldval = *reinterpret_cast<unsigned int*>(output); - unsigned int newval = oldval; - reducer.reduce(accum, reinterpret_cast<T*>(&newval)); - if (newval == oldval) { - return; - } - unsigned int readback; - while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast<T*>(&newval)); - if (newval == oldval) { - return; - } - } - } - else if (sizeof(T) == 8) { - unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output); - unsigned long long newval = oldval; - reducer.reduce(accum, reinterpret_cast<T*>(&newval)); - if (newval == oldval) { - return; - } - unsigned long long readback; - while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast<T*>(&newval)); - if (newval == oldval) { - return; - } - } - } - else { - gpu_assert(0 && "Wordsize not supported"); - } -#else // EIGEN_CUDA_ARCH >= 300 - 
gpu_assert(0 && "Shouldn't be called on unsupported device"); -#endif // EIGEN_CUDA_ARCH >= 300 -} - -// We extend atomicExch to support extra data types -template <typename Type> -__device__ inline Type atomicExchCustom(Type* address, Type val) { - return atomicExch(address, val); -} - -template <> -__device__ inline double atomicExchCustom(double* address, double val) { - unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address); - return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); -} - -#ifdef EIGEN_HAS_GPU_FP16 -template <typename R> -__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { - unsigned int oldval = *reinterpret_cast<unsigned int*>(output); - unsigned int newval = oldval; - reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); - if (newval == oldval) { - return; - } - unsigned int readback; - while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); - if (newval == oldval) { - return; - } - } -} -// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations -template <typename R> -__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { - half2* houtput=reinterpret_cast<half2*>(output); - half2* haccum=reinterpret_cast<half2*>(&accum); - for(int i=0;i<4;++i){ - atomicReduce(houtput+i,*(haccum+i),reducer); - } -} -#endif // EIGEN_HAS_GPU_FP16 - -template <> -__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { -#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) - atomicAdd(output, accum); -#else // EIGEN_CUDA_ARCH >= 300 - gpu_assert(0 && "Shouldn't be called on unsupported device"); -#endif // EIGEN_CUDA_ARCH >= 300 -} - - -template <typename CoeffType, 
typename Index> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { - const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const Index num_threads = blockDim.x * gridDim.x; - for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { - output[i] = val; - } -} - - -template <int BlockSize, int NumPerThread, typename Self, - typename Reducer, typename Index> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, - typename Self::CoeffReturnType* output, unsigned int* semaphore) { -#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) - // Initialize the output value - const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - if (gridDim.x == 1) { - if (first_index == 0) { - *output = reducer.initialize(); - } - } - else { - if (threadIdx.x == 0) { - unsigned int block = atomicCAS(semaphore, 0u, 1u); - if (block == 0) { - // We're the first block to run, initialize the output value - atomicExchCustom(output, reducer.initialize()); - __threadfence(); - atomicExch(semaphore, 2u); - } - else { - // Wait for the first block to initialize the output value. 
- // Use atomicCAS here to ensure that the reads aren't cached - unsigned int val; - do { - val = atomicCAS(semaphore, 2u, 2u); - } - while (val < 2u); - } - } - } - - __syncthreads(); - - eigen_assert(gridDim.x == 1 || *semaphore >= 2u); - - typename Self::CoeffReturnType accum = reducer.initialize(); - Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize); - for (Index i = 0; i < max_iter; i+=BlockSize) { - const Index index = first_index + i; - eigen_assert(index < num_coeffs); - typename Self::CoeffReturnType val = input.m_impl.coeff(index); - reducer.reduce(val, &accum); - } - -#pragma unroll - for (int offset = warpSize/2; offset > 0; offset /= 2) { - #if defined(EIGEN_HIPCC) - // use std::is_floating_point to determine the type of reduced_val - // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error - // and list the float and int versions of __shfl_down as the candidate functions. - if (std::is_floating_point<typename Self::CoeffReturnType>::value) { - reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum); - } else { - reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum); - } - #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 - reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); - #else - reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); - #endif - } - - if ((threadIdx.x & (warpSize - 1)) == 0) { - atomicReduce(output, accum, reducer); - } - - if (gridDim.x > 1 && threadIdx.x == 0) { - // Let the last block reset the semaphore - atomicInc(semaphore, gridDim.x + 1); -#if defined(EIGEN_HIPCC) - __threadfence_system(); -#endif - } -#else // EIGEN_CUDA_ARCH >= 300 - gpu_assert(0 && "Shouldn't be called on unsupported device"); -#endif // EIGEN_CUDA_ARCH >= 300 -} - - -#ifdef EIGEN_HAS_GPU_FP16 -template <typename Self, - typename Reducer, typename Index> -__global__ 
EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - packet_traits<Eigen::half>::type* scratch) { - eigen_assert(blockDim.x == 1); - eigen_assert(gridDim.x == 1); - typedef packet_traits<Eigen::half>::type packet_type; - Index packet_remainder = - num_coeffs % Index(unpacket_traits<packet_type>::size); - if (packet_remainder != 0) { - half2* h2scratch = reinterpret_cast<half2*>(scratch); - for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { - *h2scratch = - __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1)); - h2scratch++; - } - if ((num_coeffs & 1) != 0) { - half lastCoeff = input.m_impl.coeff(num_coeffs - 1); - *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); - } - } else { - *scratch = reducer.template initializePacket<packet_type>(); - } -} - -template <typename Self, - typename Reducer, typename Index> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) { - const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const Index num_threads = blockDim.x * gridDim.x; - typedef typename packet_traits<Eigen::half>::type PacketType; - - const Index num_packets = - num_coeffs / Index(unpacket_traits<PacketType>::size); - PacketType* p_output = reinterpret_cast<PacketType*>(output); - for (Index i = thread_id; i < num_packets; i += num_threads) { - p_output[i] = reducer.template initializePacket<PacketType>(); - } - Index packet_remainder = - num_coeffs % Index(unpacket_traits<PacketType>::size); - if (thread_id < packet_remainder) { - output[num_coeffs - packet_remainder + thread_id] = reducer.initialize(); - } -} - -template <int BlockSize, int NumPerThread, typename Self, - typename Reducer, typename Index> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - half* 
output, packet_traits<Eigen::half>::type* scratch) { - typedef typename packet_traits<Eigen::half>::type PacketType; - const int packet_width = unpacket_traits<PacketType>::size; - eigen_assert(NumPerThread % packet_width == 0); - const Index first_index = - blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x; - - // Initialize the output value if it wasn't initialized by the ReductionInitKernel - - if (gridDim.x == 1) { - if (first_index == 0) { - int rem = num_coeffs % packet_width; - if (rem != 0) { - half2* p_scratch = reinterpret_cast<half2*>(scratch); - *scratch = reducer.template initializePacket<PacketType>(); - for (int i = 0; i < rem / 2; i++) { - *p_scratch = __halves2half2( - input.m_impl.coeff(num_coeffs - packet_width + 2 * i), - input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1)); - p_scratch++; - } - if ((num_coeffs & 1) != 0) { - half last = input.m_impl.coeff(num_coeffs - 1); - *p_scratch = __halves2half2(last, reducer.initialize()); - } - } else { - *scratch = reducer.template initializePacket<PacketType>(); - } - } - __syncthreads(); - } - - PacketType accum = reducer.template initializePacket<PacketType>(); - const Index max_iter = - numext::mini<Index>((num_coeffs - first_index) / packet_width, - NumPerThread * BlockSize / packet_width); - for (Index i = 0; i < max_iter; i += BlockSize) { - const Index index = first_index + packet_width * i; - eigen_assert(index + packet_width < num_coeffs); - PacketType val = input.m_impl.template packet<Unaligned>(index); - reducer.reducePacket(val, &accum); - } - -#pragma unroll - for (int offset = warpSize/2; offset > 0; offset /= 2) { - #if defined(EIGEN_HIPCC) - PacketType r1; - half2* hr = reinterpret_cast<half2*>(&r1); - half2* hacc = reinterpret_cast<half2*>(&accum); - for (int i = 0; i < packet_width / 2; i++) { - // FIXME : remove this workaround once we have native half/half2 support for __shfl_down - union { int i; half2 h; } wka_in, wka_out; - wka_in.h = hacc[i]; - wka_out.i 
= __shfl_down(wka_in.i, offset, warpSize); - hr[i] = wka_out.h; - } - reducer.reducePacket(r1, &accum); - #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 - PacketType r1; - half2* hr = reinterpret_cast<half2*>(&r1); - half2* hacc = reinterpret_cast<half2*>(&accum); - for (int i = 0; i < packet_width / 2; i++) { - hr[i] = __shfl_down(hacc[i], offset, warpSize); - } - reducer.reducePacket(r1, &accum); - #else - PacketType r1; - half2* hr = reinterpret_cast<half2*>(&r1); - half2* hacc = reinterpret_cast<half2*>(&accum); - for (int i = 0; i < packet_width / 2; i++) { - hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize); - } - reducer.reducePacket(r1, &accum); - - #endif - } - - if ((threadIdx.x & (warpSize - 1)) == 0) { - atomicReduce(scratch, accum, reducer); - } - - __syncthreads(); - half2* rv1 = reinterpret_cast<half2*>(scratch); - if (packet_width > 2) { - reducer.reducePacket(rv1[2], rv1); - reducer.reducePacket(rv1[3], rv1 + 1); - reducer.reducePacket(rv1[1], rv1); - } - if (gridDim.x == 1) { - if (first_index == 0) { - half tmp = __low2half(*rv1); - reducer.reduce(__high2half(*rv1), &tmp); - *output = tmp; - } - } -} - -template <typename Op> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) { - eigen_assert(threadIdx.x == 1); - half2* pscratch = reinterpret_cast<half2*>(scratch); - half tmp = __float2half(0.f); - typedef packet_traits<Eigen::half>::type packet_type; - for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) { - reducer.reduce(__low2half(*pscratch), &tmp); - reducer.reduce(__high2half(*pscratch), &tmp); - pscratch++; - } - *output = tmp; -} - -#endif // EIGEN_HAS_GPU_FP16 - -template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> -struct FullReductionLauncher { - static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { - 
gpu_assert(false && "Should only be called on doubles, floats and half floats"); - } -}; - -// Specialization for float and double -template <typename Self, typename Op, typename OutputType, bool PacketAccess> -struct FullReductionLauncher< - Self, Op, OutputType, PacketAccess, - typename internal::enable_if< - internal::is_same<float, OutputType>::value || - internal::is_same<double, OutputType>::value, - void>::type> { - static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { - - typedef typename Self::Index Index; - const int block_size = 256; - const int num_per_thread = 128; - const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); - - unsigned int* semaphore = NULL; - if (num_blocks > 1) { - semaphore = device.semaphore(); - } - - LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); - } -}; - -#ifdef EIGEN_HAS_GPU_FP16 -template <typename Self, typename Op> -struct FullReductionLauncher<Self, Op, Eigen::half, false> { - static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { - gpu_assert(false && "Should not be called since there is no packet accessor"); - } -}; - -template <typename Self, typename Op> -struct FullReductionLauncher<Self, Op, Eigen::half, true> { - static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { - typedef typename Self::Index Index; - typedef typename packet_traits<Eigen::half>::type PacketType; - - const int block_size = 256; - const int num_per_thread = 128; - const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); - PacketType* scratch = static_cast<PacketType*>(device.scratchpad()); - // half2* scratch = static_cast<half2*>(device.scratchpad()); - - if (num_blocks > 1) { - // We initialize the output and the scrathpad 
outside the reduction kernel when we can't be sure that there - // won't be a race conditions between multiple thread blocks. - LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), - 1, 1, 0, device, reducer, self, num_coeffs, scratch); - } - - LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch); - - if (num_blocks > 1) { - LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), - 1, 1, 0, device, reducer, output, scratch); - } - } -}; -#endif // EIGEN_HAS_GPU_FP16 - - -template <typename Self, typename Op, bool Vectorizable> -struct FullReducer<Self, Op, GpuDevice, Vectorizable> { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple cases - // of doubles, floats and half floats -#ifdef EIGEN_HAS_GPU_FP16 - static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && - (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, double>::value || - (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); -#else // EIGEN_HAS_GPU_FP16 - static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && - (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, double>::value); -#endif // EIGEN_HAS_GPU_FP16 - - template <typename OutputType> - static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - // Don't crash when we're called with an input tensor of size 0. 
- if (num_coeffs == 0) { - return; - } - - FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs); - } -}; - - -template <int NumPerThread, typename Self, - typename Reducer, typename Index> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, - typename Self::CoeffReturnType* output) { -#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) - typedef typename Self::CoeffReturnType Type; - eigen_assert(blockDim.y == 1); - eigen_assert(blockDim.z == 1); - eigen_assert(gridDim.y == 1); - eigen_assert(gridDim.z == 1); - - const int unroll_times = 16; - eigen_assert(NumPerThread % unroll_times == 0); - - const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread); - const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; - - const Index num_threads = blockDim.x * gridDim.x; - const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - // Initialize the output values if they weren't initialized by the ReductionInitKernel - if (gridDim.x == 1) { - for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { - output[i] = reducer.initialize(); - } - __syncthreads(); - } - - for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { - const Index row = i / input_col_blocks; - - if (row < num_preserved_coeffs) { - const Index col_block = i % input_col_blocks; - const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; - - Type reduced_val = reducer.initialize(); - - for (Index j = 0; j < NumPerThread; j += unroll_times) { - const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); - if (last_col >= num_coeffs_to_reduce) { - for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { - const Type 
val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); - reducer.reduce(val, &reduced_val); - } - break; - } else { - // Faster version of the loop with no branches after unrolling. -#pragma unroll - for (int k = 0; k < unroll_times; ++k) { - const Index col = col_begin + blockDim.x * (j + k); - reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); - } - } - } - -#pragma unroll - for (int offset = warpSize/2; offset > 0; offset /= 2) { - #if defined(EIGEN_HIPCC) - // use std::is_floating_point to determine the type of reduced_val - // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error - // and list the float and int versions of __shfl_down as the candidate functions. - if (std::is_floating_point<Type>::value) { - reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val); - } else { - reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val); - } - #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 - reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); - #else - reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); - #endif - } - - if ((threadIdx.x & (warpSize - 1)) == 0) { - atomicReduce(&(output[row]), reduced_val, reducer); - } - } - } -#else // EIGEN_CUDA_ARCH >= 300 - gpu_assert(0 && "Shouldn't be called on unsupported device"); -#endif // EIGEN_CUDA_ARCH >= 300 -} - -#ifdef EIGEN_HAS_GPU_FP16 - -template <int NumPerThread, typename Self, - typename Reducer, typename Index> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, - half* output) { - eigen_assert(blockDim.y == 1); - eigen_assert(blockDim.z == 1); - eigen_assert(gridDim.y == 1); - eigen_assert(gridDim.z == 1); - - typedef typename packet_traits<Eigen::half>::type PacketType; - const int packet_width = 
unpacket_traits<PacketType>::size; - const int unroll_times = 16 / packet_width; - eigen_assert(NumPerThread % unroll_times == 0); - eigen_assert(unroll_times % 2 == 0); - - const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); - const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2); - - const Index num_threads = blockDim.x * gridDim.x; - const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - // Initialize the output values if they weren't initialized by the ReductionInitKernel - if (gridDim.x == 1) { - Index i = packet_width * thread_id; - for (; i + packet_width <= num_preserved_coeffs; - i += packet_width * num_threads) { - PacketType* poutput = reinterpret_cast<PacketType*>(output + i); - *poutput = reducer.template initializePacket<PacketType>(); - } - if (i < num_preserved_coeffs) { - output[i] = reducer.initialize(); - } - __syncthreads(); - } - - for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { - const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows - - if (row + 1 < num_preserved_coeffs) { - const Index col_block = i % input_col_blocks; - const Index col_begin = - packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x); - - PacketType reduced_val1 = reducer.template initializePacket<PacketType>(); - PacketType reduced_val2 = reducer.template initializePacket<PacketType>(); - - for (Index j = 0; j < NumPerThread; j += unroll_times) { - const Index last_col = - col_begin + blockDim.x * (j + unroll_times - 1) * packet_width; - if (last_col >= num_coeffs_to_reduce) { - Index col = col_begin + blockDim.x * j; - for (; col + packet_width <= num_coeffs_to_reduce; - col += blockDim.x) { - const PacketType val1 = input.m_impl.template packet<Unaligned>( - row * num_coeffs_to_reduce + col); - reducer.reducePacket(val1, &reduced_val1); - const PacketType val2 = input.m_impl.template packet<Unaligned>( - (row + 1) * 
num_coeffs_to_reduce + col); - reducer.reducePacket(val2, &reduced_val2); - } - if (col < num_coeffs_to_reduce) { - PacketType r1 = reducer.template initializePacket<PacketType>(); - PacketType r2 = reducer.template initializePacket<PacketType>(); - half2* hr1 = reinterpret_cast<half2*>(&r1); - half2* hr2 = reinterpret_cast<half2*>(&r2); - while (col + 1 < num_coeffs_to_reduce) { - *hr1 = __halves2half2( - input.m_impl.coeff(row * num_coeffs_to_reduce + col), - input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1)); - *hr2 = __halves2half2( - input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col), - input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + - 1)); - hr1++; - hr2++; - col += 2; - } - if (col < num_coeffs_to_reduce) { - // Peel; - const half last1 = - input.m_impl.coeff(row * num_coeffs_to_reduce + col); - *hr1 = __halves2half2(last1, reducer.initialize()); - const half last2 = - input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col); - *hr2 = __halves2half2(last2, reducer.initialize()); - } - reducer.reducePacket(r1, &reduced_val1); - reducer.reducePacket(r2, &reduced_val2); - } - break; - } else { - // Faster version of the loop with no branches after unrolling. 
-#pragma unroll - for (int k = 0; k < unroll_times; ++k) { - const Index col = col_begin + blockDim.x * (j + k) * packet_width; - reducer.reducePacket(input.m_impl.template packet<Unaligned>( - row * num_coeffs_to_reduce + col), - &reduced_val1); - reducer.reducePacket(input.m_impl.template packet<Unaligned>( - (row + 1) * num_coeffs_to_reduce + col), - &reduced_val2); - } - } - } - -#pragma unroll - for (int offset = warpSize/2; offset > 0; offset /= 2) { - #if defined(EIGEN_HIPCC) - PacketType r1; - PacketType r2; - half2* hr1 = reinterpret_cast<half2*>(&r1); - half2* hr2 = reinterpret_cast<half2*>(&r2); - half2* rv1 = reinterpret_cast<half2*>(&reduced_val1); - half2* rv2 = reinterpret_cast<half2*>(&reduced_val2); - for (int i = 0; i < packet_width / 2; i++) { - // FIXME : remove this workaround once we have native half/half2 support for __shfl_down - union { int i; half2 h; } wka_in1, wka_out1; - wka_in1.h = rv1[i]; - wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize); - hr1[i] = wka_out1.h; - - union { int i; half2 h; } wka_in2, wka_out2; - wka_in2.h = rv2[i]; - wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize); - hr2[i] = wka_out2.h; - } - reducer.reducePacket(r1, &reduced_val1); - reducer.reducePacket(r2, &reduced_val2); - #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 - PacketType r1; - PacketType r2; - half2* hr1 = reinterpret_cast<half2*>(&r1); - half2* hr2 = reinterpret_cast<half2*>(&r2); - half2* rv1 = reinterpret_cast<half2*>(&reduced_val1); - half2* rv2 = reinterpret_cast<half2*>(&reduced_val2); - for (int i = 0; i < packet_width / 2; i++) { - hr1[i] = __shfl_down(rv1[i], offset, warpSize); - hr2[i] = __shfl_down(rv2[i], offset, warpSize); - } - reducer.reducePacket(r1, &reduced_val1); - reducer.reducePacket(r2, &reduced_val2); - #else - PacketType r1; - PacketType r2; - half2* hr1 = reinterpret_cast<half2*>(&r1); - half2* hr2 = reinterpret_cast<half2*>(&r2); - half2* rr1 = reinterpret_cast<half2*>(&reduced_val1); - half2* rr2 = 
reinterpret_cast<half2*>(&reduced_val2); - for (int i = 0; i < packet_width / 2; i++) { - hr1[i] = - __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize); - hr2[i] = - __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize); - } - reducer.reducePacket(r1, &reduced_val1); - reducer.reducePacket(r2, &reduced_val2); - - #endif - } - half2* rv1 = reinterpret_cast<half2*>(&reduced_val1); - half2* rv2 = reinterpret_cast<half2*>(&reduced_val2); - half2 val; - if (packet_width > 2) { - reducer.reducePacket(rv1[2], rv1); - reducer.reducePacket(rv1[3], rv1 + 1); - reducer.reducePacket(rv1[1], rv1); - reducer.reducePacket(rv2[2], rv2); - reducer.reducePacket(rv2[3], rv2 + 1); - reducer.reducePacket(rv2[1], rv2); - } - half val1 = __low2half(*rv1); - reducer.reduce(__high2half(*rv1), &val1); - half val2 = __low2half(*rv2); - reducer.reduce(__high2half(*rv2), &val2); - val = __halves2half2(val1, val2); - if ((threadIdx.x & (warpSize - 1)) == 0) { - half* loc = output + row; - atomicReduce((half2*)loc, val, reducer); - } - } - } -} - -#endif // EIGEN_HAS_GPU_FP16 - -template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> -struct InnerReductionLauncher { - static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { - gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); - return true; - } -}; - -// Specialization for float and double -template <typename Self, typename Op, typename OutputType, bool PacketAccess> -struct InnerReductionLauncher< - Self, Op, OutputType, PacketAccess, - typename internal::enable_if< - internal::is_same<float, OutputType>::value || - internal::is_same<double, OutputType>::value, - void>::type> { - static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index 
num_preserved_vals) { - typedef typename Self::Index Index; - - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; - const int block_size = 256; - const int num_per_thread = 128; - const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); - const int max_blocks = device.getNumGpuMultiProcessors() * - device.maxGpuThreadsPerMultiProcessor() / block_size; - const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - - if (num_blocks > 1) { - // We initialize the outputs outside the reduction kernel when we can't be sure that there - // won't be a race conditions between multiple thread blocks. - const int dyn_blocks = divup<int>(num_preserved_vals, 1024); - const int max_blocks = device.getNumGpuMultiProcessors() * - device.maxGpuThreadsPerMultiProcessor() / 1024; - const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>), - num_blocks, 1024, 0, device, reducer.initialize(), - num_preserved_vals, output); - } - - LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); - - return false; - } -}; - -#ifdef EIGEN_HAS_GPU_FP16 -template <typename Self, typename Op> -struct InnerReductionLauncher<Self, Op, Eigen::half, false> { - static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { - gpu_assert(false && "Should not be called since there is no packet accessor"); - return true; - } -}; - -template <typename Self, typename Op> -struct InnerReductionLauncher<Self, Op, Eigen::half, true> { - static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { - typedef typename Self::Index Index; - - if (num_preserved_vals % 2 != 0) { - // Not supported yet, revert to the slower code path - 
return true; - } - - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; - const int block_size = /*256*/128; - const int num_per_thread = /*128*/64; - const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); - const int max_blocks = device.getNumGpuMultiProcessors() * - device.maxGpuThreadsPerMultiProcessor() / block_size; - const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - - if (num_blocks > 1) { - // We initialize the outputs outside the reduction kernel when we can't be sure that there - // won't be a race conditions between multiple thread blocks. - LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), - 1, 1, 0, device, reducer, self, num_preserved_vals, output); - } - - LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); - - return false; - } -}; -#endif // EIGEN_HAS_GPU_FP16 - - -template <typename Self, typename Op> -struct InnerReducer<Self, Op, GpuDevice> { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats and half floats. 
-#ifdef EIGEN_HAS_GPU_FP16 - static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && - (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, double>::value || - (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); -#else // EIGEN_HAS_GPU_FP16 - static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && - (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, double>::value); -#endif // EIGEN_HAS_GPU_FP16 - - template <typename OutputType> - static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { - gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - // Don't crash when we're called with an input tensor of size 0. - if (num_coeffs == 0) { - return true; - } - // It's faster to use the usual code. 
- if (num_coeffs_to_reduce <= 128) { - return true; - } - - return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); - } -}; - -template <int NumPerThread, typename Self, - typename Reducer, typename Index> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, - typename Self::CoeffReturnType* output) { - const Index num_threads = blockDim.x * gridDim.x; - const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - // Initialize the output values if they weren't initialized by the ReductionInitKernel - if (gridDim.x == 1) { - for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { - output[i] = reducer.initialize(); - } - __syncthreads(); - } - - // Do the reduction. - const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread); - for (Index i = thread_id; i < max_iter; i += num_threads) { - const Index input_col = i % num_preserved_coeffs; - const Index input_row = (i / num_preserved_coeffs) * NumPerThread; - typename Self::CoeffReturnType reduced_val = reducer.initialize(); - const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); - for (Index j = input_row; j < max_row; j++) { - typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col); - reducer.reduce(val, &reduced_val); - } - atomicReduce(&(output[input_col]), reduced_val, reducer); - } -} - - -template <typename Self, typename Op> -struct OuterReducer<Self, Op, GpuDevice> { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. 
- static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && - (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, double>::value); - template <typename Device, typename OutputType> - static - #if !defined(EIGEN_HIPCC) - // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error - // (in the cxx11_tensor_reduction_gpu test) - // - // terminate called after throwing an instance of 'std::runtime_error' - // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL... - // - // don't know why this happens (and why is it a runtime error instead of a compile time error) - // - // this will be fixed by HIP PR#457 - EIGEN_DEVICE_FUNC - #endif - bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { - gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device"); - return true; - } - - static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { - typedef typename Self::Index Index; - - // It's faster to use the usual code. - if (num_coeffs_to_reduce <= 32) { - return true; - } - - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; - const int block_size = 256; - const int num_per_thread = 16; - const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); - const int max_blocks = device.getNumGpuMultiProcessors() * - device.maxGpuThreadsPerMultiProcessor() / block_size; - const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - - if (num_blocks > 1) { - // We initialize the outputs in the reduction kernel itself when we don't have to worry - // about race conditions between multiple thread blocks. 
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024); - const int max_blocks = device.getNumGpuMultiProcessors() * - device.maxGpuThreadsPerMultiProcessor() / 1024; - const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>), - num_blocks, 1024, 0, device, reducer.initialize(), - num_preserved_vals, output); - } - - LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); - - return false; - } -}; - -#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h deleted file mode 100644 index 474eba0..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h +++ /dev/null @@ -1,582 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorReductionSycl.h - * - * \brief: - * This is the specialization of the reduction operation. Two phase reduction approach - * is used since the GPU does not have Global Synchronization for global memory among - * different work-group/thread block. 
To solve the problem, we need to create two kernels - * to reduce the data, where the first kernel reduce the data locally and each local - * workgroup/thread-block save the input data into global memory. In the second phase (global reduction) - * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. - * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU: - * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf - * - *****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP -namespace Eigen { -namespace TensorSycl { -namespace internal { - -template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable> -struct OpDefiner { - typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType; - typedef Op type; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, - const Index &) { - return accumulator; - } -}; - -template <typename CoeffReturnType, typename Index> -struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> { - typedef Eigen::internal::SumReducer<CoeffReturnType> type; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) { - return type(); - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, - const Index &scale) { - ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op; - return quotient_op(accumulator, CoeffReturnType(scale)); - } -}; - -template <typename CoeffReturnType, typename Index> -struct 
OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> { - typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType; - typedef Eigen::internal::SumReducer<CoeffReturnType> type; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) { - return type(); - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, - const Index &scale) { - return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale))); - } -}; - -template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index, - Index local_range> -struct SecondStepFullReducer { - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - LocalAccessor; - typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef; - typedef typename OpDef::type Op; - LocalAccessor scratch; - InputAccessor aI; - OutputAccessor outAcc; - Op op; - SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_) - : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {} - - void operator()(cl::sycl::nd_item<1> itemID) { - // Our empirical research shows that the best performance will be achieved - // when there is only one element per thread to reduce in the second step. - // in this step the second step reduction time is almost negligible. - // Hence, in the second step of reduction the input size is fixed to the - // local size, thus, there is only one element read per thread. The - // algorithm must be changed if the number of reduce per thread in the - // second step is greater than 1. Otherwise, the result will be wrong. 
- const Index localid = itemID.get_local_id(0); - auto aInPtr = aI.get_pointer() + localid; - auto aOutPtr = outAcc.get_pointer(); - CoeffReturnType *scratchptr = scratch.get_pointer(); - CoeffReturnType accumulator = *aInPtr; - - scratchptr[localid] = op.finalize(accumulator); - for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (localid < offset) { - op.reduce(scratchptr[localid + offset], &accumulator); - scratchptr[localid] = op.finalize(accumulator); - } - } - if (localid == 0) *aOutPtr = op.finalize(accumulator); - } -}; - -// Full reduction first phase. In this version the vectorization is true and the reduction accept -// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ). -template <typename Evaluator, typename OpType, typename Evaluator::Index local_range> -class FullReductionKernelFunctor { - public: - typedef typename Evaluator::CoeffReturnType CoeffReturnType; - typedef typename Evaluator::Index Index; - typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index, - (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)> - OpDef; - - typedef typename OpDef::type Op; - typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; - typedef typename Evaluator::PacketReturnType PacketReturnType; - typedef - typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), - PacketReturnType, CoeffReturnType>::type OutType; - typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - LocalAccessor; - LocalAccessor scratch; - Evaluator evaluator; - EvaluatorPointerType final_output; - Index rng; - Op op; - - FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, - Index rng_, OpType op_) - : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), 
rng(rng_), op(OpDef::get_op(op_)) {} - - void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); } - - template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction( - const cl::sycl::nd_item<1> &itemID) { - auto output_ptr = final_output.get_pointer(); - Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; - Index globalid = itemID.get_global_id(0); - Index localid = itemID.get_local_id(0); - Index step = Evaluator::PacketSize * itemID.get_global_range(0); - Index start = Evaluator::PacketSize * globalid; - // vectorizable parts - PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>(); - for (Index i = start; i < VectorizedRange; i += step) { - op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator); - } - globalid += VectorizedRange; - // non vectorizable parts - for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { - op.template reducePacket<PacketReturnType>( - ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type( - evaluator.impl().coeff(i), op.initialize()), - &packetAccumulator); - } - scratch[localid] = packetAccumulator = - OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng); - // reduction parts // Local size is always power of 2 - EIGEN_UNROLL_LOOP - for (Index offset = local_range / 2; offset > 0; offset /= 2) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (localid < offset) { - op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator); - scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator); - } - } - if (localid == 0) { - output_ptr[itemID.get_group(0)] = - op.finalizeBoth(op.initialize(), 
op.template finalizePacket<PacketReturnType>(packetAccumulator)); - } - } - - template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction( - const cl::sycl::nd_item<1> &itemID) { - auto output_ptr = final_output.get_pointer(); - Index globalid = itemID.get_global_id(0); - Index localid = itemID.get_local_id(0); - // vectorizable parts - CoeffReturnType accumulator = op.initialize(); - // non vectorizable parts - for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { - op.reduce(evaluator.impl().coeff(i), &accumulator); - } - scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng); - - // reduction parts. the local size is always power of 2 - EIGEN_UNROLL_LOOP - for (Index offset = local_range / 2; offset > 0; offset /= 2) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (localid < offset) { - op.reduce(scratch[localid + offset], &accumulator); - scratch[localid] = op.finalize(accumulator); - } - } - if (localid == 0) { - output_ptr[itemID.get_group(0)] = op.finalize(accumulator); - } - } -}; - -template <typename Evaluator, typename OpType> -class GenericNondeterministicReducer { - public: - typedef typename Evaluator::CoeffReturnType CoeffReturnType; - typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; - typedef typename Evaluator::Index Index; - typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef; - typedef typename OpDef::type Op; - template <typename Scratch> - GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, - Index range_, Index num_values_to_reduce_) - : evaluator(evaluator_), - output_accessor(output_accessor_), - functor(OpDef::get_op(functor_)), - range(range_), - num_values_to_reduce(num_values_to_reduce_) {} - - void operator()(cl::sycl::nd_item<1> 
itemID) { - auto output_accessor_ptr = output_accessor.get_pointer(); - /// const cast added as a naive solution to solve the qualifier drop error - Index globalid = static_cast<Index>(itemID.get_global_linear_id()); - if (globalid < range) { - CoeffReturnType accum = functor.initialize(); - Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce( - evaluator, evaluator.firstInput(globalid), functor, &accum); - output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); - } - } - - private: - Evaluator evaluator; - EvaluatorPointerType output_accessor; - Op functor; - Index range; - Index num_values_to_reduce; -}; - -enum class reduction_dim { inner_most, outer_most }; -// default is preserver -template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt> -struct PartialReductionKernel { - typedef typename Evaluator::CoeffReturnType CoeffReturnType; - typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; - typedef typename Evaluator::Index Index; - typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef; - typedef typename OpDef::type Op; - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - ScratchAcc; - ScratchAcc scratch; - Evaluator evaluator; - EvaluatorPointerType output_accessor; - Op op; - const Index preserve_elements_num_groups; - const Index reduce_elements_num_groups; - const Index num_coeffs_to_preserve; - const Index num_coeffs_to_reduce; - - PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, - const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, - const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_) - : scratch(scratch_), - evaluator(evaluator_), - output_accessor(output_accessor_), - op(OpDef::get_op(op_)), - 
preserve_elements_num_groups(preserve_elements_num_groups_), - reduce_elements_num_groups(reduce_elements_num_groups_), - num_coeffs_to_preserve(num_coeffs_to_preserve_), - num_coeffs_to_reduce(num_coeffs_to_reduce_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, - CoeffReturnType &accumulator) { - if (globalPId >= num_coeffs_to_preserve) { - return; - } - Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve) - : globalRId + (globalPId * num_coeffs_to_reduce); - Index localOffset = globalRId; - - const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; - const Index per_thread_global_stride = - rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; - for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { - op.reduce(evaluator.impl().coeff(global_offset), &accumulator); - localOffset += per_thread_local_stride; - global_offset += per_thread_global_stride; - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { - const Index linearLocalThreadId = itemID.get_local_id(0); - Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP - : linearLocalThreadId / PannelParameters::LocalThreadSizeR; - Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP - : linearLocalThreadId % PannelParameters::LocalThreadSizeR; - const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups - : itemID.get_group(0) / reduce_elements_num_groups; - const Index rGroupId = rt == reduction_dim::outer_most ? 
itemID.get_group(0) / preserve_elements_num_groups - : itemID.get_group(0) % reduce_elements_num_groups; - - Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; - const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId; - auto scratchPtr = scratch.get_pointer().get(); - auto outPtr = - output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0); - CoeffReturnType accumulator = op.initialize(); - - element_wise_reduce(globalRId, globalPId, accumulator); - - accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce); - scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] = - accumulator; - if (rt == reduction_dim::inner_most) { - pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP; - rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP; - globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; - } - - /* Apply the reduction operation between the current local - * id and the one on the other half of the vector. */ - auto out_scratch_ptr = - scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC))); - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (rt == reduction_dim::inner_most) { - accumulator = *out_scratch_ptr; - } - // The Local LocalThreadSizeR is always power of 2 - EIGEN_UNROLL_LOOP - for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { - if (rLocalThreadId < offset) { - op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); - // The result has already been divided for mean reducer in the - // previous reduction so no need to divide furthermore - *out_scratch_ptr = op.finalize(accumulator); - } - /* All threads collectively read from global memory into local. 
- * The barrier ensures all threads' IO is resolved before - * execution continues (strictly speaking, all threads within - * a single work-group - there is no co-ordination between - * work-groups, only work-items). */ - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - - if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) { - outPtr[globalPId] = op.finalize(accumulator); - } - } -}; - -template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType> -struct SecondStepPartialReduction { - typedef OpDefiner<OpType, OutScalar, Index, false> OpDef; - typedef typename OpDef::type Op; - typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - ScratchAccessor; - InputAccessor input_accessor; - OutputAccessor output_accessor; - Op op; - const Index num_coeffs_to_preserve; - const Index num_coeffs_to_reduce; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, - OutputAccessor output_accessor_, OpType op_, - const Index num_coeffs_to_preserve_, - const Index num_coeffs_to_reduce_) - : input_accessor(input_accessor_), - output_accessor(output_accessor_), - op(OpDef::get_op(op_)), - num_coeffs_to_preserve(num_coeffs_to_preserve_), - num_coeffs_to_reduce(num_coeffs_to_reduce_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { - const Index globalId = itemID.get_global_id(0); - - if (globalId >= num_coeffs_to_preserve) return; - - auto in_ptr = input_accessor.get_pointer() + globalId; - - OutScalar accumulator = op.initialize(); -// num_coeffs_to_reduce is not bigger that 256 - for (Index i = 0; i < num_coeffs_to_reduce; i++) { - op.reduce(*in_ptr, &accumulator); - in_ptr += num_coeffs_to_preserve; - } - output_accessor.get_pointer()[globalId] = op.finalize(accumulator); - } -}; // namespace internal - -template <typename Index, Index LTP, 
Index LTR, bool BC_> -struct ReductionPannel { - static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; - static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; - static EIGEN_CONSTEXPR bool BC = BC_; -}; - -template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt> -struct PartialReducerLauncher { - typedef typename Self::EvaluatorPointerType EvaluatorPointerType; - typedef typename Self::CoeffReturnType CoeffReturnType; - typedef typename Self::Storage Storage; - typedef typename Self::Index Index; - typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true> - PannelParameters; - - typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType; - - static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, - Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) { - Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP); - - // getPowerOfTwo makes sure local range is power of 2 and <= - // maxSyclThreadPerBlock this will help us to avoid extra check on the - // kernel - static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) & - (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)), - "The Local thread size must be a power of 2 for the reduction " - "operation"); - - EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; - // In this step, we force the code not to be more than 2-step reduction: - // Our empirical research shows that if each thread reduces at least 64 - // elemnts individually, we get better performance. However, this can change - // on different platforms. 
In this step we force the code not to be - // morthan step reduction: Our empirical research shows that for inner_most - // dim reducer, it is better to have 8 group in a reduce dimension for sizes - // > 1024 to achieve the best performance. - const Index reductionPerThread = 64; - Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true); - const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP; - Index rGroups = (cu + pNumGroups - 1) / pNumGroups; - const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1; - const Index globalRange = pNumGroups * rNumGroups * localRange; - - EIGEN_CONSTEXPR Index scratchSize = - PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC); - auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); - if (rNumGroups > 1) { - CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>( - dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType))); - EvaluatorPointerType temp_accessor = dev.get(temp_pointer); - dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>( - self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, - num_coeffs_to_reduce); - - typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op> - SecondStepPartialReductionKernel; - - dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>( - temp_accessor, output, - cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1), - reducer, num_coeffs_to_preserve, rNumGroups); - - self.device().deallocate_temp(temp_pointer); - } else { - dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>( - self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, 
num_coeffs_to_preserve, - num_coeffs_to_reduce); - } - return false; - } -}; -} // namespace internal -} // namespace TensorSycl - -namespace internal { - -template <typename Self, typename Op, bool Vectorizable> -struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> { - typedef typename Self::CoeffReturnType CoeffReturnType; - typedef typename Self::EvaluatorPointerType EvaluatorPointerType; - static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; - static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1; - static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) { - typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType; - static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & - (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), - "The Local thread size must be a power of 2 for the reduction " - "operation"); - EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; - - typename Self::Index inputSize = self.impl().dimensions().TotalSize(); - // In this step we force the code not to be more than 2-step reduction: - // Our empirical research shows that if each thread reduces at least 512 - // elemnts individually, we get better performance. - const Index reductionPerThread = 2048; - // const Index num_work_group = - Index reductionGroup = dev.getPowerOfTwo( - (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true); - const Index num_work_group = std::min(reductionGroup, local_range); - // 1 - // ? 
local_range - // : 1); - const Index global_range = num_work_group * local_range; - - auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); - typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t; - if (num_work_group > 1) { - CoeffReturnType *temp_pointer = - static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType))); - typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); - dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range, - local_range, inputSize, reducer); - - typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType, - EvaluatorPointerType, Index, local_range> - GenericRKernel; - dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>( - tmp_global_accessor, data, - cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group, - reducer); - - dev.deallocate_temp(temp_pointer); - } else { - dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize, - reducer); - } - } -}; -// vectorizable inner_most most dim preserver -// col reduction -template <typename Self, typename Op> -struct OuterReducer<Self, Op, Eigen::SyclDevice> { - static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; - - static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, - typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, - typename Self::Index num_coeffs_to_preserve) { - return ::Eigen::TensorSycl::internal::PartialReducerLauncher< - Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output, - num_coeffs_to_reduce, - num_coeffs_to_preserve); - } -}; -// row reduction -template <typename Self, typename Op> -struct 
InnerReducer<Self, Op, Eigen::SyclDevice> { - static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; - - static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, - typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, - typename Self::Index num_coeffs_to_preserve) { - return ::Eigen::TensorSycl::internal::PartialReducerLauncher< - Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output, - num_coeffs_to_reduce, - num_coeffs_to_preserve); - } -}; - -// ArmgMax uses this kernel for partial reduction// -// TODO(@mehdi.goli) come up with a better kernel -// generic partial reduction -template <typename Self, typename Op> -struct GenericReducer<Self, Op, Eigen::SyclDevice> { - static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false; - static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, - typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, - typename Self::Index num_coeffs_to_preserve) { - typename Self::Index range, GRange, tileSize; - dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); - - dev.template unary_kernel_launcher<typename Self::CoeffReturnType, - TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>( - self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), - reducer, range, (num_values_to_reduce != 0) ? 
num_values_to_reduce : static_cast<Index>(1)); - return false; - } -}; - -} // namespace internal -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h deleted file mode 100644 index a27d364..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h +++ /dev/null @@ -1,454 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H -#define EIGEN_CXX11_TENSOR_TENSOR_REF_H - -namespace Eigen { - -namespace internal { - -template <typename Dimensions, typename Scalar> -class TensorLazyBaseEvaluator { - public: - TensorLazyBaseEvaluator() : m_refcount(0) { } - virtual ~TensorLazyBaseEvaluator() { } - - EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0; - EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0; - - EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0; - EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0; - - void incrRefCount() { ++m_refcount; } - void decrRefCount() { --m_refcount; } - int refCount() const { return m_refcount; } - - private: - // No copy, no assignment; - TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); - TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other); - - int m_refcount; -}; - - -template <typename Dimensions, typename Expr, typename Device> -class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> { - public: - // typedef typename TensorEvaluator<Expr, 
Device>::Dimensions Dimensions; - typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar; - typedef StorageMemory<Scalar, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef TensorEvaluator<Expr, Device> EvalType; - - TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { - m_dims = m_impl.dimensions(); - m_impl.evalSubExprsIfNeeded(NULL); - } - virtual ~TensorLazyEvaluatorReadOnly() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const { - return m_dims; - } - EIGEN_DEVICE_FUNC virtual const Scalar* data() const { - return m_impl.data(); - } - - EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const { - return m_impl.coeff(index); - } - EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) { - eigen_assert(false && "can't reference the coefficient of a rvalue"); - return m_dummy; - }; - - protected: - TensorEvaluator<Expr, Device> m_impl; - Dimensions m_dims; - Scalar m_dummy; -}; - -template <typename Dimensions, typename Expr, typename Device> -class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> { - public: - typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base; - typedef typename Base::Scalar Scalar; - typedef StorageMemory<Scalar, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) { - } - virtual ~TensorLazyEvaluatorWritable() { - } - - EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) { - return this->m_impl.coeffRef(index); - } -}; - -template <typename Dimensions, typename Expr, typename Device> -class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value), - TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, - TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type { - 
public: - typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value), - TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, - TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base; - typedef typename Base::Scalar Scalar; - - TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) { - } - virtual ~TensorLazyEvaluator() { - } -}; - -} // namespace internal - - -/** \class TensorRef - * \ingroup CXX11_Tensor_Module - * - * \brief A reference to a tensor expression - * The expression will be evaluated lazily (as much as possible). - * - */ -template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> > -{ - public: - typedef TensorRef<PlainObjectType> Self; - typedef typename PlainObjectType::Base Base; - typedef typename Eigen::internal::nested<Self>::type Nested; - typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; - typedef typename internal::traits<PlainObjectType>::Index Index; - typedef typename internal::traits<PlainObjectType>::Scalar Scalar; - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - typedef Scalar* PointerType; - typedef PointerType PointerArgType; - - static const Index NumIndices = PlainObjectType::NumIndices; - typedef typename PlainObjectType::Dimensions Dimensions; - - enum { - IsAligned = false, - PacketAccess = false, - BlockAccess = false, - PreferBlockAccess = false, - Layout = PlainObjectType::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { - } - - template <typename Expression> - EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new 
internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) { - m_evaluator->incrRefCount(); - } - - template <typename Expression> - EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) { - unrefEvaluator(); - m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice()); - m_evaluator->incrRefCount(); - return *this; - } - - ~TensorRef() { - unrefEvaluator(); - } - - TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) { - eigen_assert(m_evaluator->refCount() > 0); - m_evaluator->incrRefCount(); - } - - TensorRef& operator = (const TensorRef& other) { - if (this != &other) { - unrefEvaluator(); - m_evaluator = other.m_evaluator; - eigen_assert(m_evaluator->refCount() > 0); - m_evaluator->incrRefCount(); - } - return *this; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator()(Index index) const - { - return m_evaluator->coeff(index); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template<typename... IndexTypes> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const - { - const std::size_t num_indices = (sizeof...(otherIndices) + 1); - const array<Index, num_indices> indices{{firstIndex, otherIndices...}}; - return coeff(indices); - } - template<typename... 
IndexTypes> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) - { - const std::size_t num_indices = (sizeof...(otherIndices) + 1); - const array<Index, num_indices> indices{{firstIndex, otherIndices...}}; - return coeffRef(indices); - } -#else - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const - { - array<Index, 2> indices; - indices[0] = i0; - indices[1] = i1; - return coeff(indices); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const - { - array<Index, 3> indices; - indices[0] = i0; - indices[1] = i1; - indices[2] = i2; - return coeff(indices); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const - { - array<Index, 4> indices; - indices[0] = i0; - indices[1] = i1; - indices[2] = i2; - indices[3] = i3; - return coeff(indices); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - array<Index, 5> indices; - indices[0] = i0; - indices[1] = i1; - indices[2] = i2; - indices[3] = i3; - indices[4] = i4; - return coeff(indices); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1) - { - array<Index, 2> indices; - indices[0] = i0; - indices[1] = i1; - return coeffRef(indices); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2) - { - array<Index, 3> indices; - indices[0] = i0; - indices[1] = i1; - indices[2] = i2; - return coeffRef(indices); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - array<Index, 4> indices; - indices[0] = i0; - indices[1] = i1; - indices[2] = i2; - indices[3] = i3; - return coeffRef(indices); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4) - { - array<Index, 5> 
indices; - indices[0] = i0; - indices[1] = i1; - indices[2] = i2; - indices[3] = i3; - indices[4] = i4; - return coeffRef(indices); - } -#endif - - template <std::size_t NumIndices> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const - { - const Dimensions& dims = this->dimensions(); - Index index = 0; - if (PlainObjectType::Options & RowMajor) { - index += indices[0]; - for (size_t i = 1; i < NumIndices; ++i) { - index = index * dims[i] + indices[i]; - } - } else { - index += indices[NumIndices-1]; - for (int i = NumIndices-2; i >= 0; --i) { - index = index * dims[i] + indices[i]; - } - } - return m_evaluator->coeff(index); - } - template <std::size_t NumIndices> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) - { - const Dimensions& dims = this->dimensions(); - Index index = 0; - if (PlainObjectType::Options & RowMajor) { - index += indices[0]; - for (size_t i = 1; i < NumIndices; ++i) { - index = index * dims[i] + indices[i]; - } - } else { - index += indices[NumIndices-1]; - for (int i = NumIndices-2; i >= 0; --i) { - index = index * dims[i] + indices[i]; - } - } - return m_evaluator->coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar coeff(Index index) const - { - return m_evaluator->coeff(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - return m_evaluator->coeffRef(index); - } - - private: - EIGEN_STRONG_INLINE void unrefEvaluator() { - if (m_evaluator) { - m_evaluator->decrRefCount(); - if (m_evaluator->refCount() == 0) { - delete m_evaluator; - } - } - } - - internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator; -}; - - -// evaluator for rvalues -template<typename Derived, typename Device> -struct TensorEvaluator<const TensorRef<Derived>, Device> -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar 
CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = false, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorRef<Derived>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&) - : m_ref(m) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - return true; - } - - EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_ref.coeff(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return m_ref.coeffRef(index); - } - - EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); } - - protected: - TensorRef<Derived> m_ref; -}; - - -// evaluator for lvalues -template<typename Derived, typename Device> -struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device> -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - - typedef TensorEvaluator<const TensorRef<Derived>, Device> Base; - - enum { - IsAligned = false, - PacketAccess = false, - BlockAccess = false, 
- PreferBlockAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return this->m_ref.coeffRef(index); - } -}; - - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h deleted file mode 100644 index 586ce68..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h +++ /dev/null @@ -1,465 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> -// Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H -#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H -namespace Eigen { - -/** \class TensorReverse - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reverse elements class. 
- * - */ -namespace internal { -template<typename ReverseDimensions, typename XprType> -struct traits<TensorReverseOp<ReverseDimensions, - XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename ReverseDimensions, typename XprType> -struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense> -{ - typedef const TensorReverseOp<ReverseDimensions, XprType>& type; -}; - -template<typename ReverseDimensions, typename XprType> -struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1, - typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type> -{ - typedef TensorReverseOp<ReverseDimensions, XprType> type; -}; - -} // end namespace internal - -template<typename ReverseDimensions, typename XprType> -class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions, - XprType>, WriteAccessors> -{ - public: - typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors>Base; - typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested; - typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind - StorageKind; - typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp( - const XprType& expr, const ReverseDimensions& reverse_dims) - : m_xpr(expr), m_reverse_dims(reverse_dims) { } - - 
EIGEN_DEVICE_FUNC - const ReverseDimensions& reverse() const { return m_reverse_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp) - - - protected: - typename XprType::Nested m_xpr; - const ReverseDimensions m_reverse_dims; -}; - -// Eval as rvalue -template<typename ReverseDimensions, typename ArgType, typename Device> -struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device> -{ - typedef TensorReverseOp<ReverseDimensions, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<ReverseDimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = NumDims > 0, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - typedef internal::TensorIntDivisor<Index> IndexDivisor; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock - ArgTensorBlock; - - typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, - Layout, Index> - TensorBlock; - 
//===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), - m_reverse(op.reverse()), - m_device(device) - { - // Reversing a scalar isn't supported yet. It would be a no-op anyway. - EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Compute strides - m_dimensions = m_impl.dimensions(); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; - if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); - } - } else { - m_strides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; - if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex( - Index index) const { - eigen_assert(index < dimensions().TotalSize()); - Index inputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - Index idx = index / m_fastStrides[i]; - index -= idx * m_strides[i]; - if (m_reverse[i]) { - idx = m_dimensions[i] - idx - 1; - } - inputIndex += idx * m_strides[i] ; - } - if (m_reverse[0]) { - inputIndex 
+= (m_dimensions[0] - index - 1); - } else { - inputIndex += index; - } - } else { - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - Index idx = index / m_fastStrides[i]; - index -= idx * m_strides[i]; - if (m_reverse[i]) { - idx = m_dimensions[i] - idx - 1; - } - inputIndex += idx * m_strides[i] ; - } - if (m_reverse[NumDims-1]) { - inputIndex += (m_dimensions[NumDims-1] - index - 1); - } else { - inputIndex += index; - } - } - return inputIndex; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( - Index index) const { - return m_impl.coeff(reverseIndex(index)); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - // TODO(ndjaitly): write a better packing routine that uses - // local structure. - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type - values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = m_device.lastLevelCacheSize(); - // Block evaluation reads underlying memory in reverse order, and default - // cost model does not properly catch this in bytes stored/loaded. - return internal::TensorBlockResourceRequirements::skewed<Scalar>( - target_size) - .addCostPerCoeff({0, 0, 24}); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - // TODO(ezhulenev): If underlying tensor expression supports and prefers - // block evaluation we must use it. 
Currently we use coeff and packet - // access into the underlying tensor expression. - // static const bool useBlockAccessForArgType = - // TensorEvaluator<ArgType, Device>::BlockAccess && - // TensorEvaluator<ArgType, Device>::PreferBlockAccess; - - static const bool isColMajor = - static_cast<int>(Layout) == static_cast<int>(ColMajor); - - static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1; - const bool inner_dim_reversed = m_reverse[inner_dim_idx]; - - // Offset in the output block. - Index block_offset = 0; - - // Offset in the input Tensor. - Index input_offset = reverseIndex(desc.offset()); - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array<BlockIteratorState, NumDims> it; - for (int i = 0; i < NumDims; ++i) { - const int dim = isColMajor ? i : NumDims - 1 - i; - it[i].size = desc.dimension(dim); - it[i].count = 0; - it[i].reverse = m_reverse[dim]; - - it[i].block_stride = - i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride); - it[i].block_span = it[i].block_stride * (it[i].size - 1); - - it[i].input_stride = m_strides[dim]; - it[i].input_span = it[i].input_stride * (it[i].size - 1); - - if (it[i].reverse) { - it[i].input_stride = -1 * it[i].input_stride; - it[i].input_span = -1 * it[i].input_span; - } - } - - // If multiple inner dimensions have the same reverse flag, check if we can - // merge them into a single virtual inner dimension. - int effective_inner_dim = 0; - for (int i = 1; i < NumDims; ++i) { - if (it[i].reverse != it[effective_inner_dim].reverse) break; - if (it[i].block_stride != it[effective_inner_dim].size) break; - if (it[i].block_stride != numext::abs(it[i].input_stride)) break; - - it[i].size = it[effective_inner_dim].size * it[i].size; - - it[i].block_stride = 1; - it[i].input_stride = (inner_dim_reversed ? 
-1 : 1); - - it[i].block_span = it[i].block_stride * (it[i].size - 1); - it[i].input_span = it[i].input_stride * (it[i].size - 1); - - effective_inner_dim = i; - } - - eigen_assert(it[effective_inner_dim].block_stride == 1); - eigen_assert(it[effective_inner_dim].input_stride == - (inner_dim_reversed ? -1 : 1)); - - const Index inner_dim_size = it[effective_inner_dim].size; - - // Prepare storage for the materialized reverse result. - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - CoeffReturnType* block_buffer = block_storage.data(); - - while (it[NumDims - 1].count < it[NumDims - 1].size) { - // Copy inner-most dimension data from reversed location in input. - Index dst = block_offset; - Index src = input_offset; - - // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed - // worse results in benchmarks than a simple coefficient loop. - if (inner_dim_reversed) { - for (Index i = 0; i < inner_dim_size; ++i) { - block_buffer[dst] = m_impl.coeff(src); - ++dst; - --src; - } - } else { - for (Index i = 0; i < inner_dim_size; ++i) { - block_buffer[dst] = m_impl.coeff(src); - ++dst; - ++src; - } - } - - // For the 1d tensor we need to generate only one inner-most dimension. - if ((NumDims - effective_inner_dim) == 1) break; - - // Update offset. 
- for (Index i = effective_inner_dim + 1; i < NumDims; ++i) { - if (++it[i].count < it[i].size) { - block_offset += it[i].block_stride; - input_offset += it[i].input_stride; - break; - } - if (i != NumDims - 1) it[i].count = 0; - block_offset -= it[i].block_span; - input_offset -= it[i].input_span; - } - } - - return block_storage.AsTensorMaterializedBlock(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + - 2 * TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>()); - for (int i = 0; i < NumDims; ++i) { - if (m_reverse[i]) { - compute_cost += 2 * TensorOpCost::AddCost<Index>(); - } - } - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); - } - - EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: - Dimensions m_dimensions; - array<Index, NumDims> m_strides; - array<IndexDivisor, NumDims> m_fastStrides; - TensorEvaluator<ArgType, Device> m_impl; - ReverseDimensions m_reverse; - const Device EIGEN_DEVICE_REF m_device; - - private: - struct BlockIteratorState { - BlockIteratorState() - : size(0), - count(0), - reverse(false), - block_stride(0), - block_span(0), - input_stride(0), - input_span(0) {} - - Index size; - Index count; - bool reverse; - Index block_stride; - Index block_span; - Index input_stride; - Index input_span; - }; -}; - -// Eval as lvalue - -template <typename ReverseDimensions, typename ArgType, typename Device> -struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> - : public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, - Device> { - typedef TensorEvaluator<const 
TensorReverseOp<ReverseDimensions, ArgType>, - Device> Base; - typedef TensorReverseOp<ReverseDimensions, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<ReverseDimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) {} - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Dimensions& dimensions() const { return this->m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return this->m_impl.coeffRef(this->reverseIndex(index)); - } - - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - // This code is pilfered from TensorMorphing.h - EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; - internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - this->coeffRef(index+i) = values[i]; - } - } -}; - - -} // end namespace Eigen - -#endif // 
EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h deleted file mode 100644 index beae854..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h +++ /dev/null @@ -1,528 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H -#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H - -namespace Eigen { - -namespace internal { - -template <typename Op, typename XprType> -struct traits<TensorScanOp<Op, XprType> > - : public traits<XprType> { - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename Op, typename XprType> -struct eval<TensorScanOp<Op, XprType>, Eigen::Dense> -{ - typedef const TensorScanOp<Op, XprType>& type; -}; - -template<typename Op, typename XprType> -struct nested<TensorScanOp<Op, XprType>, 1, - typename eval<TensorScanOp<Op, XprType> >::type> -{ - typedef TensorScanOp<Op, XprType> type; -}; -} // end namespace internal - -/** \class TensorScan - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor scan class. 
- */ -template <typename Op, typename XprType> -class TensorScanOp - : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> { -public: - typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorScanOp>::type Nested; - typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorScanOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp( - const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op()) - : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Index axis() const { return m_axis; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const XprType& expression() const { return m_expr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Op accumulator() const { return m_accumulator; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - bool exclusive() const { return m_exclusive; } - -protected: - typename XprType::Nested m_expr; - const Index m_axis; - const Op m_accumulator; - const bool m_exclusive; -}; - - -namespace internal { - -template <typename Self> -EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset, - typename Self::CoeffReturnType* data) { - // Compute the scan along the axis, starting at the given offset - typename Self::CoeffReturnType accum = self.accumulator().initialize(); - if (self.stride() == 1) { - if (self.exclusive()) { - for (Index curr = offset; curr < offset + self.size(); ++curr) { - data[curr] = self.accumulator().finalize(accum); - self.accumulator().reduce(self.inner().coeff(curr), &accum); - } - } else { - for (Index curr = offset; curr < offset + self.size(); ++curr) { - self.accumulator().reduce(self.inner().coeff(curr), &accum); - data[curr] = 
self.accumulator().finalize(accum); - } - } - } else { - if (self.exclusive()) { - for (Index idx3 = 0; idx3 < self.size(); idx3++) { - Index curr = offset + idx3 * self.stride(); - data[curr] = self.accumulator().finalize(accum); - self.accumulator().reduce(self.inner().coeff(curr), &accum); - } - } else { - for (Index idx3 = 0; idx3 < self.size(); idx3++) { - Index curr = offset + idx3 * self.stride(); - self.accumulator().reduce(self.inner().coeff(curr), &accum); - data[curr] = self.accumulator().finalize(accum); - } - } - } -} - -template <typename Self> -EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset, - typename Self::CoeffReturnType* data) { - using Scalar = typename Self::CoeffReturnType; - using Packet = typename Self::PacketReturnType; - // Compute the scan along the axis, starting at the calculated offset - Packet accum = self.accumulator().template initializePacket<Packet>(); - if (self.stride() == 1) { - if (self.exclusive()) { - for (Index curr = offset; curr < offset + self.size(); ++curr) { - internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum)); - self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum); - } - } else { - for (Index curr = offset; curr < offset + self.size(); ++curr) { - self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum); - internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum)); - } - } - } else { - if (self.exclusive()) { - for (Index idx3 = 0; idx3 < self.size(); idx3++) { - const Index curr = offset + idx3 * self.stride(); - internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum)); - self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum); - } - } else { - for (Index idx3 = 0; idx3 < self.size(); idx3++) { - const Index curr = offset + idx3 * self.stride(); - self.accumulator().reducePacket(self.inner().template 
packet<Unaligned>(curr), &accum); - internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum)); - } - } - } -} - -template <typename Self, bool Vectorize, bool Parallel> -struct ReduceBlock { - EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, - typename Self::CoeffReturnType* data) { - for (Index idx2 = 0; idx2 < self.stride(); idx2++) { - // Calculate the starting offset for the scan - Index offset = idx1 + idx2; - ReduceScalar(self, offset, data); - } - } -}; - -// Specialization for vectorized reduction. -template <typename Self> -struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> { - EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, - typename Self::CoeffReturnType* data) { - using Packet = typename Self::PacketReturnType; - const int PacketSize = internal::unpacket_traits<Packet>::size; - Index idx2 = 0; - for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) { - // Calculate the starting offset for the packet scan - Index offset = idx1 + idx2; - ReducePacket(self, offset, data); - } - for (; idx2 < self.stride(); idx2++) { - // Calculate the starting offset for the scan - Index offset = idx1 + idx2; - ReduceScalar(self, offset, data); - } - } -}; - -// Single-threaded CPU implementation of scan -template <typename Self, typename Reducer, typename Device, - bool Vectorize = - (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess && - internal::reducer_traits<Reducer, Device>::PacketAccess)> -struct ScanLauncher { - void operator()(Self& self, typename Self::CoeffReturnType* data) { - Index total_size = internal::array_prod(self.dimensions()); - - // We fix the index along the scan axis to 0 and perform a - // scan per remaining entry. The iteration is split into two nested - // loops to avoid an integer division by keeping track of each idx1 and - // idx2. 
- for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { - ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer; - block_reducer(self, idx1, data); - } - } -}; - -#ifdef EIGEN_USE_THREADS - -// Adjust block_size to avoid false sharing of cachelines among -// threads. Currently set to twice the cache line size on Intel and ARM -// processors. -EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) { - EIGEN_CONSTEXPR Index kBlockAlignment = 128; - const Index items_per_cacheline = - numext::maxi<Index>(1, kBlockAlignment / item_size); - return items_per_cacheline * divup(block_size, items_per_cacheline); -} - -template <typename Self> -struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> { - EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, - typename Self::CoeffReturnType* data) { - using Scalar = typename Self::CoeffReturnType; - using Packet = typename Self::PacketReturnType; - const int PacketSize = internal::unpacket_traits<Packet>::size; - Index num_scalars = self.stride(); - Index num_packets = 0; - if (self.stride() >= PacketSize) { - num_packets = self.stride() / PacketSize; - self.device().parallelFor( - num_packets, - TensorOpCost(PacketSize * self.size(), PacketSize * self.size(), - 16 * PacketSize * self.size(), true, PacketSize), - // Make the shard size large enough that two neighboring threads - // won't write to the same cacheline of `data`. - [=](Index blk_size) { - return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size); - }, - [&](Index first, Index last) { - for (Index packet = first; packet < last; ++packet) { - const Index idx2 = packet * PacketSize; - ReducePacket(self, idx1 + idx2, data); - } - }); - num_scalars -= num_packets * PacketSize; - } - self.device().parallelFor( - num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()), - // Make the shard size large enough that two neighboring threads - // won't write to the same cacheline of `data`. 
- [=](Index blk_size) { - return AdjustBlockSize(sizeof(Scalar), blk_size); - }, - [&](Index first, Index last) { - for (Index scalar = first; scalar < last; ++scalar) { - const Index idx2 = num_packets * PacketSize + scalar; - ReduceScalar(self, idx1 + idx2, data); - } - }); - } -}; - -template <typename Self> -struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> { - EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, - typename Self::CoeffReturnType* data) { - using Scalar = typename Self::CoeffReturnType; - self.device().parallelFor( - self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()), - // Make the shard size large enough that two neighboring threads - // won't write to the same cacheline of `data`. - [=](Index blk_size) { - return AdjustBlockSize(sizeof(Scalar), blk_size); - }, - [&](Index first, Index last) { - for (Index idx2 = first; idx2 < last; ++idx2) { - ReduceScalar(self, idx1 + idx2, data); - } - }); - } -}; - -// Specialization for multi-threaded execution. -template <typename Self, typename Reducer, bool Vectorize> -struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> { - void operator()(Self& self, typename Self::CoeffReturnType* data) { - using Scalar = typename Self::CoeffReturnType; - using Packet = typename Self::PacketReturnType; - const int PacketSize = internal::unpacket_traits<Packet>::size; - const Index total_size = internal::array_prod(self.dimensions()); - const Index inner_block_size = self.stride() * self.size(); - bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size)); - - if ((parallelize_by_outer_blocks && total_size <= 4096) || - (!parallelize_by_outer_blocks && self.stride() < PacketSize)) { - ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher; - launcher(self, data); - return; - } - - if (parallelize_by_outer_blocks) { - // Parallelize over outer blocks. 
- const Index num_outer_blocks = total_size / inner_block_size; - self.device().parallelFor( - num_outer_blocks, - TensorOpCost(inner_block_size, inner_block_size, - 16 * PacketSize * inner_block_size, Vectorize, - PacketSize), - [=](Index blk_size) { - return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size); - }, - [&](Index first, Index last) { - for (Index idx1 = first; idx1 < last; ++idx1) { - ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer; - block_reducer(self, idx1 * inner_block_size, data); - } - }); - } else { - // Parallelize over inner packets/scalars dimensions when the reduction - // axis is not an inner dimension. - ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer; - for (Index idx1 = 0; idx1 < total_size; - idx1 += self.stride() * self.size()) { - block_reducer(self, idx1, data); - } - } - } -}; -#endif // EIGEN_USE_THREADS - -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) - -// GPU implementation of scan -// TODO(ibab) This placeholder implementation performs multiple scans in -// parallel, but it would be better to use a parallel scan algorithm and -// optimize memory access. 
-template <typename Self, typename Reducer> -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { - // Compute offset as in the CPU version - Index val = threadIdx.x + blockIdx.x * blockDim.x; - Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); - - if (offset + (self.size() - 1) * self.stride() < total_size) { - // Compute the scan along the axis, starting at the calculated offset - typename Self::CoeffReturnType accum = self.accumulator().initialize(); - for (Index idx = 0; idx < self.size(); idx++) { - Index curr = offset + idx * self.stride(); - if (self.exclusive()) { - data[curr] = self.accumulator().finalize(accum); - self.accumulator().reduce(self.inner().coeff(curr), &accum); - } else { - self.accumulator().reduce(self.inner().coeff(curr), &accum); - data[curr] = self.accumulator().finalize(accum); - } - } - } - __syncthreads(); - -} - -template <typename Self, typename Reducer, bool Vectorize> -struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> { - void operator()(const Self& self, typename Self::CoeffReturnType* data) { - Index total_size = internal::array_prod(self.dimensions()); - Index num_blocks = (total_size / self.size() + 63) / 64; - Index block_size = 64; - - LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data); - } -}; -#endif // EIGEN_USE_GPU && (EIGEN_GPUCC) - -} // namespace internal - -// Eval as rvalue -template <typename Op, typename ArgType, typename Device> -struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { - - typedef TensorScanOp<Op, ArgType> XprType; - typedef typename XprType::Index Index; - typedef const ArgType ChildTypeNoConst; - typedef const ArgType ChildType; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename 
internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self; - typedef StorageMemory<Scalar, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, - RawAccess = true - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), - m_device(device), - m_exclusive(op.exclusive()), - m_accumulator(op.accumulator()), - m_size(m_impl.dimensions()[op.axis()]), - m_stride(1), m_consume_dim(op.axis()), - m_output(NULL) { - - // Accumulating a scalar isn't supported. - EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(op.axis() >= 0 && op.axis() < NumDims); - - // Compute stride of scan axis - const Dimensions& dims = m_impl.dimensions(); - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < op.axis(); ++i) { - m_stride = m_stride * dims[i]; - } - } else { - // dims can only be indexed through unsigned integers, - // so let's use an unsigned type to let the compiler knows. 
- // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function" - unsigned int axis = internal::convert_index<unsigned int>(op.axis()); - for (unsigned int i = NumDims - 1; i > axis; --i) { - m_stride = m_stride * dims[i]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { - return m_stride; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { - return m_consume_dim; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { - return m_size; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { - return m_accumulator; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { - return m_exclusive; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const { - return m_impl; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { - return m_device; - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - m_impl.evalSubExprsIfNeeded(NULL); - internal::ScanLauncher<Self, Op, Device> launcher; - if (data) { - launcher(*this, data); - return false; - } - - const Index total_size = internal::array_prod(dimensions()); - m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar)))); - launcher(*this, m_output); - return true; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - return internal::ploadt<PacketReturnType, LoadMode>(m_output + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const - { - return m_output; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_output[index]; - } - - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0); - } - - EIGEN_STRONG_INLINE void cleanup() { - if (m_output) { - m_device.deallocate_temp(m_output); - m_output = NULL; - } - m_impl.cleanup(); - } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - m_output.bind(cgh); - } -#endif -protected: - TensorEvaluator<ArgType, Device> m_impl; - const Device EIGEN_DEVICE_REF m_device; - const bool m_exclusive; - Op m_accumulator; - const Index m_size; - Index m_stride; - Index m_consume_dim; - EvaluatorPointerType m_output; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h deleted file mode 100644 index 7f68ecb..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h +++ /dev/null @@ -1,513 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorScanSycl.h - * - * \brief: - * Tensor Scan Sycl implement the extend version of - * "Efficient parallel scan algorithms for GPUs." .for Tensor operations. - * The algorithm requires up to 3 stage (consequently 3 kernels) depending on - * the size of the tensor. 
In the first kernel (ScanKernelFunctor), each - * threads within the work-group individually reduces the allocated elements per - * thread in order to reduces the total number of blocks. In the next step all - * thread within the work-group will reduce the associated blocks into the - * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary - * buffer is given as an input and all the threads within a work-group scan and - * reduces the boundaries between the blocks (generated from the previous - * kernel). and write the data on the temporary buffer. If the second kernel is - * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will - * adjust the final result into the output buffer. - * The original algorithm for the parallel prefix sum can be found here: - * - * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel - * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003 - *1, no. 1 (2008): 1-17. - *****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE -#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4) -#endif - -template <typename index_t> -struct ScanParameters { - // must be power of 2 - static EIGEN_CONSTEXPR index_t ScanPerThread = 8; - const index_t total_size; - const index_t non_scan_size; - const index_t scan_size; - const index_t non_scan_stride; - const index_t scan_stride; - const index_t panel_threads; - const index_t group_threads; - const index_t block_threads; - const index_t elements_per_group; - const index_t elements_per_block; - const index_t loop_range; - - ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, - 
index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, - index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_) - : total_size(total_size_), - non_scan_size(non_scan_size_), - scan_size(scan_size_), - non_scan_stride(non_scan_stride_), - scan_stride(scan_stride_), - panel_threads(panel_threads_), - group_threads(group_threads_), - block_threads(block_threads_), - elements_per_group(elements_per_group_), - elements_per_block(elements_per_block_), - loop_range(loop_range_) {} -}; - -enum class scan_step { first, second }; -template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index, - scan_step stp> -struct ScanKernelFunctor { - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - LocalAccessor; - static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2; - - LocalAccessor scratch; - Evaluator dev_eval; - OutAccessor out_accessor; - OutAccessor temp_accessor; - const ScanParameters<Index> scanParameters; - Op accumulator; - const bool inclusive; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, - OutAccessor out_accessor_, OutAccessor temp_accessor_, - const ScanParameters<Index> scanParameters_, Op accumulator_, - const bool inclusive_) - : scratch(scratch_), - dev_eval(dev_eval_), - out_accessor(out_accessor_), - temp_accessor(temp_accessor_), - scanParameters(scanParameters_), - accumulator(accumulator_), - inclusive(inclusive_) {} - - template <scan_step sst = stp, typename Input> - typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE - read(const Input &inpt, Index global_id) { - return inpt.coeff(global_id); - } - - template <scan_step sst = stp, typename Input> - typename ::Eigen::internal::enable_if<sst != scan_step::first, 
CoeffReturnType>::type EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE - read(const Input &inpt, Index global_id) { - return inpt[global_id]; - } - - template <scan_step sst = stp, typename InclusiveOp> - typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - first_step_inclusive_Operation(InclusiveOp inclusive_op) { - inclusive_op(); - } - - template <scan_step sst = stp, typename InclusiveOp> - typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - first_step_inclusive_Operation(InclusiveOp) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { - auto out_ptr = out_accessor.get_pointer(); - auto tmp_ptr = temp_accessor.get_pointer(); - auto scratch_ptr = scratch.get_pointer().get(); - - for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { - Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); - Index tmp = data_offset % scanParameters.panel_threads; - const Index panel_id = data_offset / scanParameters.panel_threads; - const Index group_id = tmp / scanParameters.group_threads; - tmp = tmp % scanParameters.group_threads; - const Index block_id = tmp / scanParameters.block_threads; - const Index local_id = tmp % scanParameters.block_threads; - // we put one element per packet in scratch_mem - const Index scratch_stride = scanParameters.elements_per_block / PacketSize; - const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride; - CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread]; - CoeffReturnType inclusive_scan; - // the actual panel size is scan_size * non_scan_size. 
- // elements_per_panel is roundup to power of 2 for binary tree - const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; - const Index group_offset = group_id * scanParameters.non_scan_stride; - // This will be effective when the size is bigger than elements_per_block - const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; - const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride); - const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; - Index next_elements = 0; - EIGEN_UNROLL_LOOP - for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) { - Index global_id = global_offset + next_elements; - private_scan[i] = ((((block_id * scanParameters.elements_per_block) + - (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) && - (global_id < scanParameters.total_size)) - ? read(dev_eval, global_id) - : accumulator.initialize(); - next_elements += scanParameters.scan_stride; - } - first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { - if (inclusive) { - inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1]; - } - }); - // This for loop must be 2 - EIGEN_UNROLL_LOOP - for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) { - Index private_offset = 1; - // build sum in place up the tree - EIGEN_UNROLL_LOOP - for (Index d = PacketSize >> 1; d > 0; d >>= 1) { - EIGEN_UNROLL_LOOP - for (Index l = 0; l < d; l++) { - Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; - Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; - CoeffReturnType accum = accumulator.initialize(); - accumulator.reduce(private_scan[ai], &accum); - accumulator.reduce(private_scan[bi], &accum); - private_scan[bi] = accumulator.finalize(accum); - } - private_offset *= 2; - } - scratch_ptr[2 * local_id + (packetIndex / 
PacketSize) + scratch_offset] = - private_scan[PacketSize - 1 + packetIndex]; - private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize(); - // traverse down tree & build scan - EIGEN_UNROLL_LOOP - for (Index d = 1; d < PacketSize; d *= 2) { - private_offset >>= 1; - EIGEN_UNROLL_LOOP - for (Index l = 0; l < d; l++) { - Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; - Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; - CoeffReturnType accum = accumulator.initialize(); - accumulator.reduce(private_scan[ai], &accum); - accumulator.reduce(private_scan[bi], &accum); - private_scan[ai] = private_scan[bi]; - private_scan[bi] = accumulator.finalize(accum); - } - } - } - - Index offset = 1; - // build sum in place up the tree - for (Index d = scratch_stride >> 1; d > 0; d >>= 1) { - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (local_id < d) { - Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; - Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; - CoeffReturnType accum = accumulator.initialize(); - accumulator.reduce(scratch_ptr[ai], &accum); - accumulator.reduce(scratch_ptr[bi], &accum); - scratch_ptr[bi] = accumulator.finalize(accum); - } - offset *= 2; - } - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // next step optimisation - if (local_id == 0) { - if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) { - const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) * - scanParameters.non_scan_size + - group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) + - block_id; - tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset]; - } - // clear the last element - scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize(); - } - // traverse down tree & build scan - for (Index d = 1; d < scratch_stride; d *= 2) { - 
offset >>= 1; - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (local_id < d) { - Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; - Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; - CoeffReturnType accum = accumulator.initialize(); - accumulator.reduce(scratch_ptr[ai], &accum); - accumulator.reduce(scratch_ptr[bi], &accum); - scratch_ptr[ai] = scratch_ptr[bi]; - scratch_ptr[bi] = accumulator.finalize(accum); - } - } - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // This for loop must be 2 - EIGEN_UNROLL_LOOP - for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) { - EIGEN_UNROLL_LOOP - for (Index i = 0; i < PacketSize; i++) { - CoeffReturnType accum = private_scan[packetIndex + i]; - accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum); - private_scan[packetIndex + i] = accumulator.finalize(accum); - } - } - first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { - if (inclusive) { - accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan); - private_scan[0] = accumulator.finalize(inclusive_scan); - } - }); - next_elements = 0; - // right the first set of private param - EIGEN_UNROLL_LOOP - for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) { - Index global_id = global_offset + next_elements; - if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) < - scanParameters.scan_size) && - (global_id < scanParameters.total_size)) { - Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive)); - out_ptr[global_id] = private_scan[private_id]; - } - next_elements += scanParameters.scan_stride; - } - } // end for loop - } -}; - -template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index> 
-struct ScanAdjustmentKernelFunctor { - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> - LocalAccessor; - static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2; - InAccessor in_accessor; - OutAccessor out_accessor; - const ScanParameters<Index> scanParameters; - Op accumulator; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, - OutAccessor out_accessor_, - const ScanParameters<Index> scanParameters_, - Op accumulator_) - : in_accessor(in_accessor_), - out_accessor(out_accessor_), - scanParameters(scanParameters_), - accumulator(accumulator_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { - auto in_ptr = in_accessor.get_pointer(); - auto out_ptr = out_accessor.get_pointer(); - - for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { - Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); - Index tmp = data_offset % scanParameters.panel_threads; - const Index panel_id = data_offset / scanParameters.panel_threads; - const Index group_id = tmp / scanParameters.group_threads; - tmp = tmp % scanParameters.group_threads; - const Index block_id = tmp / scanParameters.block_threads; - const Index local_id = tmp % scanParameters.block_threads; - - // the actual panel size is scan_size * non_scan_size. 
- // elements_per_panel is roundup to power of 2 for binary tree - const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; - const Index group_offset = group_id * scanParameters.non_scan_stride; - // This will be effective when the size is bigger than elements_per_block - const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; - const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride; - - const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; - const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block; - const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id; - CoeffReturnType adjust_val = in_ptr[in_id]; - - Index next_elements = 0; - EIGEN_UNROLL_LOOP - for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) { - Index global_id = global_offset + next_elements; - if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) < - scanParameters.scan_size) && - (global_id < scanParameters.total_size)) { - CoeffReturnType accum = adjust_val; - accumulator.reduce(out_ptr[global_id], &accum); - out_ptr[global_id] = accumulator.finalize(accum); - } - next_elements += scanParameters.scan_stride; - } - } - } -}; - -template <typename Index> -struct ScanInfo { - const Index &total_size; - const Index &scan_size; - const Index &panel_size; - const Index &non_scan_size; - const Index &scan_stride; - const Index &non_scan_stride; - - Index max_elements_per_block; - Index block_size; - Index panel_threads; - Index group_threads; - Index block_threads; - Index elements_per_group; - Index elements_per_block; - Index loop_range; - Index global_range; - Index local_range; - const Eigen::SyclDevice &dev; - EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index 
&scan_size_, const Index &panel_size_, - const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, - const Eigen::SyclDevice &dev_) - : total_size(total_size_), - scan_size(scan_size_), - panel_size(panel_size_), - non_scan_size(non_scan_size_), - scan_stride(scan_stride_), - non_scan_stride(non_scan_stride_), - dev(dev_) { - // must be power of 2 - local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()), - Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1)); - - max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread; - - elements_per_group = - dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true); - const Index elements_per_panel = elements_per_group * non_scan_size; - elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block)); - panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread; - group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread; - block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread; - block_size = elements_per_group / elements_per_block; -#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE - const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE)); -#else - const Index max_threads = panel_threads * panel_size; -#endif - global_range = roundUp(max_threads, local_range); - loop_range = Index( - std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread))); - } - inline ScanParameters<Index> get_scan_parameter() { - return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads, - group_threads, block_threads, elements_per_group, elements_per_block, loop_range); - } - inline cl::sycl::nd_range<1> get_thread_range() { - return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); - } -}; - 
-template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index> -struct SYCLAdjustBlockOffset { - EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, - Reducer &accumulator, const Index total_size, - const Index scan_size, const Index panel_size, - const Index non_scan_size, const Index scan_stride, - const Index non_scan_stride, const Eigen::SyclDevice &dev) { - auto scan_info = - ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); - - typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index> - AdjustFuctor; - dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(), - scan_info.max_elements_per_block, - scan_info.get_scan_parameter(), accumulator); - } -}; - -template <typename CoeffReturnType, scan_step stp> -struct ScanLauncher_impl { - template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index> - EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, - const Index total_size, const Index scan_size, const Index panel_size, - const Index non_scan_size, const Index scan_stride, - const Index non_scan_stride, const bool inclusive, - const Eigen::SyclDevice &dev) { - auto scan_info = - ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); - const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size; - const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2); - CoeffReturnType *temp_pointer = - static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType))); - EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); - - typedef ScanKernelFunctor<Input, CoeffReturnType, 
EvaluatorPointerType, Reducer, Index, stp> ScanFunctor; - dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>( - in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size, - scan_info.get_scan_parameter(), accumulator, inclusive); - - if (scan_info.block_size > 1) { - ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block( - tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size, - non_scan_size, Index(1), scan_info.block_size, false, dev); - - SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset( - tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, - non_scan_stride, dev); - } - dev.deallocate_temp(temp_pointer); - } -}; - -} // namespace internal -} // namespace TensorSycl -namespace internal { -template <typename Self, typename Reducer, bool vectorize> -struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> { - typedef typename Self::Index Index; - typedef typename Self::CoeffReturnType CoeffReturnType; - typedef typename Self::Storage Storage; - typedef typename Self::EvaluatorPointerType EvaluatorPointerType; - void operator()(Self &self, EvaluatorPointerType data) { - const Index total_size = internal::array_prod(self.dimensions()); - const Index scan_size = self.size(); - const Index scan_stride = self.stride(); - // this is the scan op (can be sum or ...) 
- auto accumulator = self.accumulator(); - auto inclusive = !self.exclusive(); - auto consume_dim = self.consume_dim(); - auto dev = self.device(); - - auto dims = self.inner().dimensions(); - - Index non_scan_size = 1; - Index panel_size = 1; - if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < consume_dim; i++) { - non_scan_size *= dims[i]; - } - for (int i = consume_dim + 1; i < Self::NumDims; i++) { - panel_size *= dims[i]; - } - } else { - for (int i = Self::NumDims - 1; i > consume_dim; i--) { - non_scan_size *= dims[i]; - } - for (int i = consume_dim - 1; i >= 0; i--) { - panel_size *= dims[i]; - } - } - const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size; - auto eval_impl = self.inner(); - TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block( - eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, - inclusive, dev); - } -}; -} // namespace internal -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h deleted file mode 100644 index e5e5efd..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h +++ /dev/null @@ -1,471 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H -#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H - -namespace Eigen { - -/** \class TensorShuffling - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor shuffling class. 
- * - * - */ -namespace internal { -template<typename Shuffle, typename XprType> -struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename Shuffle, typename XprType> -struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense> -{ - typedef const TensorShufflingOp<Shuffle, XprType>& type; -}; - -template<typename Shuffle, typename XprType> -struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type> -{ - typedef TensorShufflingOp<Shuffle, XprType> type; -}; - -} // end namespace internal - - - -template<typename Shuffle, typename XprType> -class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> > -{ - public: - typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base; - typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested; - typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl) - : m_xpr(expr), m_shuffle(shfl) {} - - EIGEN_DEVICE_FUNC - const Shuffle& shufflePermutation() const { return m_shuffle; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename 
XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp) - - - protected: - typename XprType::Nested m_xpr; - const Shuffle m_shuffle; -}; - - -// Eval as rvalue -template<typename Shuffle, typename ArgType, typename Device> -struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> -{ - typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self; - typedef TensorShufflingOp<Shuffle, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, - Layout, Index> - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_device(device), - 
m_impl(op.expression(), device) - { - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - const Shuffle& shuffle = op.shufflePermutation(); - m_is_identity = true; - for (int i = 0; i < NumDims; ++i) { - m_shuffle[i] = static_cast<int>(shuffle[i]); - m_dimensions[i] = input_dims[shuffle[i]]; - m_inverseShuffle[shuffle[i]] = i; - if (m_is_identity && shuffle[i] != i) { - m_is_identity = false; - } - } - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_unshuffledInputStrides[0] = 1; - m_outputStrides[0] = 1; - - for (int i = 1; i < NumDims; ++i) { - m_unshuffledInputStrides[i] = - m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>( - m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); - } - } else { - m_unshuffledInputStrides[NumDims - 1] = 1; - m_outputStrides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_unshuffledInputStrides[i] = - m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; - m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>( - m_outputStrides[i] > 0 ? 
m_outputStrides[i] : Index(1)); - } - } - - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - -#ifdef EIGEN_USE_THREADS - template <typename EvalSubExprsCallback> - EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - if (m_is_identity) { - return m_impl.coeff(index); - } else { - return m_impl.coeff(srcCoeff(index)); - } - } - - template <int LoadMode, typename Self, bool ImplPacketAccess> - struct PacketLoader { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static PacketReturnType Run(const Self& self, Index index) { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = self.coeff(index + i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - }; - - template<int LoadMode, typename Self> - struct PacketLoader<LoadMode, Self, true> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static PacketReturnType Run(const Self& self, Index index) { - if (self.m_is_identity) { - return self.m_impl.template packet<LoadMode>(index); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = self.coeff(index + i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - } - }; 
- - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); - return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - static const int inner_dim = - Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1; - - const size_t target_size = m_device.firstLevelCacheSize(); - const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim; - - // Shuffled inner dimensions leads to a random memory access, which is not - // captured by default cost model bytes loaded/stored. We add this cost - // explicitly. The number of cycles picked based on the benchmarks. - // TODO(ezhulenev): This number was picked based on a very questionable - // benchmarks, add benchmarks that are representative of real workloads. 
- using BlockRequirements = internal::TensorBlockResourceRequirements; - if (inner_dim_shuffled) { - return BlockRequirements::uniform<Scalar>(target_size) - .addCostPerCoeff({0, 0, NumDims * 28}); - } else { - return BlockRequirements::skewed<Scalar>(target_size); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool root_of_expr_ast = false) const { - assert(m_impl.data() != NULL); - - typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout> - TensorBlockIO; - typedef typename TensorBlockIO::Dst TensorBlockIODst; - typedef typename TensorBlockIO::Src TensorBlockIOSrc; - - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage( - desc, scratch, /*allow_strided_storage=*/root_of_expr_ast); - - typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); - TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); - - TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(), - block_storage.data()); - - typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); - TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); - - return block_storage.AsTensorMaterializedBlock(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = m_is_identity ? 
TensorOpCost::AddCost<Index>() : - NumDims * (2 * TensorOpCost::AddCost<Index>() + - 2 * TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>()); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); - } - - EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( - Index input_index, - const DSizes<Index, NumDims>& input_block_strides, - const DSizes<Index, NumDims>& output_block_strides, - const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const { - Index output_index = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = input_index / fast_input_block_strides[i]; - output_index += idx * output_block_strides[m_inverseShuffle[i]]; - input_index -= idx * input_block_strides[i]; - } - return output_index + input_index * - output_block_strides[m_inverseShuffle[0]]; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = input_index / fast_input_block_strides[i]; - output_index += idx * output_block_strides[m_inverseShuffle[i]]; - input_index -= idx * input_block_strides[i]; - } - return output_index + input_index * - output_block_strides[m_inverseShuffle[NumDims - 1]]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { - Index inputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - return inputIndex + index * m_inputStrides[0]; - } 
else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - return inputIndex + index * m_inputStrides[NumDims - 1]; - } - } - - Dimensions m_dimensions; - bool m_is_identity; - array<int, NumDims> m_shuffle; - array<Index, NumDims> m_inverseShuffle; // TODO(ezhulenev): Make it int type. - array<Index, NumDims> m_outputStrides; - array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; - array<Index, NumDims> m_inputStrides; - array<Index, NumDims> m_unshuffledInputStrides; - - const Device EIGEN_DEVICE_REF m_device; - TensorEvaluator<ArgType, Device> m_impl; -}; - - -// Eval as lvalue -template<typename Shuffle, typename ArgType, typename Device> -struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> - : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> -{ - typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base; - - typedef TensorShufflingOp<Shuffle, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - - enum { - IsAligned = false, - PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess, - PreferBlockAccess = true, - Layout = TensorEvaluator<ArgType, Device>::Layout, - RawAccess = false - }; - - typedef typename internal::remove_const<Scalar>::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef 
internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - - template <int StoreMode> EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - this->coeffRef(index+i) = values[i]; - } - } - - template <typename TensorBlock> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - eigen_assert(this->m_impl.data() != NULL); - - typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout> - TensorBlockIO; - typedef typename TensorBlockIO::Dst TensorBlockIODst; - typedef typename TensorBlockIO::Src TensorBlockIOSrc; - - const Scalar* block_buffer = block.data(); - - // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen - // expression with coefficient and packet access as `src`. - void* mem = NULL; - if (block_buffer == NULL) { - mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); - ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem); - - typedef internal::TensorBlockAssignment< - ScalarNoConst, NumDims, typename TensorBlock::XprType, Index> - TensorBlockAssignment; - - TensorBlockAssignment::Run( - TensorBlockAssignment::target( - desc.dimensions(), internal::strides<Layout>(desc.dimensions()), - buf), - block.expr()); - - block_buffer = buf; - } - - // Read from block. 
- TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()), - block_buffer); - - // Write to the output buffer. - typename TensorBlockIO::Dimensions output_strides( - this->m_unshuffledInputStrides); - typename TensorBlockIO::Dimensions output_dimensions; - for (int i = 0; i < NumDims; ++i) { - output_dimensions[this->m_shuffle[i]] = desc.dimension(i); - } - TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), - this->srcCoeff(desc.offset())); - - // Reorder dimensions according to the shuffle. - typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; - for (int i = 0; i < NumDims; ++i) { - dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]); - } - TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); - - // Deallocate temporary buffer used for the block materialization. - if (mem != NULL) this->m_device.deallocate(mem); - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h deleted file mode 100644 index 5ff0880..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h +++ /dev/null @@ -1,161 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H -#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H - -#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN - #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN; -#else - #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN -#endif - -namespace Eigen { - -/** \internal - * - * \class TensorStorage - * \ingroup CXX11_Tensor_Module - * - * \brief Stores the data of a tensor - * - * This class stores the data of fixed-size, dynamic-size or mixed tensors - * in a way as compact as possible. - * - * \sa Tensor - */ -template<typename T, typename Dimensions, int Options> class TensorStorage; - - -// Pure fixed-size storage -template<typename T, typename FixedDimensions, int Options_> -class TensorStorage -{ - private: - static const std::size_t Size = FixedDimensions::total_size; - - // Allocate an array of size at least one to prevent compiler warnings. - static const std::size_t MinSize = max_n_1<Size>::size; - EIGEN_ALIGN_MAX T m_data[MinSize]; - - public: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStorage() { - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T *data() { return m_data; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T *data() const { return m_data; } - - static EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const FixedDimensions& dimensions() - { - static const FixedDimensions* singleton_dimensions = new FixedDimensions(); - return *singleton_dimensions; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE DenseIndex size() const { return Size; } -}; - -// pure dynamic -template<typename T, typename IndexType, int NumIndices_, int Options_> -class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> -{ - public: - typedef IndexType Index; - typedef DSizes<IndexType, NumIndices_> Dimensions; - typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self; - - EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() { - if (NumIndices_ == 0) { - m_data = 
internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1); - } - } - EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert) - : m_data(0), m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {} - EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions) - : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) - { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template <typename... DenseIndex> - EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) { - m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions)); - } -#endif - - EIGEN_DEVICE_FUNC TensorStorage(const Self& other) - : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions))) - , m_dimensions(other.m_dimensions) - { - internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data); - } - EIGEN_DEVICE_FUNC Self& operator=(const Self& other) - { - if (this != &other) { - Self tmp(other); - this->swap(tmp); - } - return *this; - } - -#if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage() - { - *this = std::move(other); - } - - EIGEN_DEVICE_FUNC Self& operator=(Self&& other) - { - numext::swap(m_data, other.m_data); - numext::swap(m_dimensions, other.m_dimensions); - return *this; - } -#endif - - EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); } - EIGEN_DEVICE_FUNC void swap(Self& other) - { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;} - - EIGEN_DEVICE_FUNC void resize(Index size, const 
array<Index, NumIndices_>& nbDimensions) - { - const Index currentSz = internal::array_prod(m_dimensions); - if(size != currentSz) - { - internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz); - if (size) - m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size); - else if (NumIndices_ == 0) { - m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1); - } - else - m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) - } - m_dimensions = nbDimensions; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } - - private: - T *m_data; - Dimensions m_dimensions; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h deleted file mode 100644 index 2f62a66..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h +++ /dev/null @@ -1,346 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H -#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H - -namespace Eigen { - -/** \class TensorStriding - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor striding class. 
- * - * - */ -namespace internal { -template<typename Strides, typename XprType> -struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; -}; - -template<typename Strides, typename XprType> -struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense> -{ - typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type; -}; - -template<typename Strides, typename XprType> -struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type> -{ - typedef TensorStridingOp<Strides, XprType> type; -}; - -} // end namespace internal - - - -template<typename Strides, typename XprType> -class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> > -{ - public: - typedef TensorBase<TensorStridingOp<Strides, XprType> > Base; - typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested; - typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) - : m_xpr(expr), m_dims(dims) {} - - EIGEN_DEVICE_FUNC - const Strides& strides() const { return m_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - 
expression() const { return m_xpr; } - - EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp) - - protected: - typename XprType::Nested m_xpr; - const Strides m_dims; -}; - - -// Eval as rvalue -template<typename Strides, typename ArgType, typename Device> -struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> -{ - typedef TensorStridingOp<Strides, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - m_dimensions = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]); - } - - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - if (static_cast<int>(Layout) == 
static_cast<int>(ColMajor)) { - m_outputStrides[0] = 1; - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_inputStrides[i-1] *= op.strides()[i-1]; - } - m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; - } else { // RowMajor - m_outputStrides[NumDims-1] = 1; - m_inputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_inputStrides[i+1] *= op.strides()[i+1]; - } - m_inputStrides[0] *= op.strides()[0]; - } - } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + PacketSize - 1}; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; - inputIndices[0] += idx0 * m_inputStrides[i]; - inputIndices[1] += idx1 * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += indices[0] * m_inputStrides[0]; - inputIndices[1] += indices[1] * 
m_inputStrides[0]; - } else { // RowMajor - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; - inputIndices[0] += idx0 * m_inputStrides[i]; - inputIndices[1] += idx1 * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; - inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; - } - if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { - PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); - return rslt; - } - else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[PacketSize-1] = m_impl.coeff(inputIndices[1]); - EIGEN_UNROLL_LOOP - for (int i = 1; i < PacketSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() + - TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>()) + - TensorOpCost::MulCost<Index>(); - if (vectorized) { - compute_cost *= 2; // packet() computes two indices - } - const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1); - return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) + - // Computation is not vectorized per se, but it is done once per packet. 
- TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex = 0; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += index * m_inputStrides[0]; - } else { // RowMajor - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += index * m_inputStrides[NumDims-1]; - } - return inputIndex; - } - - Dimensions m_dimensions; - array<Index, NumDims> m_outputStrides; - array<Index, NumDims> m_inputStrides; - TensorEvaluator<ArgType, Device> m_impl; -}; - -// Eval as lvalue -template<typename Strides, typename ArgType, typename Device> -struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> - : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> -{ - typedef TensorStridingOp<Strides, ArgType> XprType; - typedef TensorEvaluator<const XprType, Device> Base; - // typedef typename XprType::Index Index; - static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - // typedef DSizes<Index, NumDims> Dimensions; - - enum { - IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - 
CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); - - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + PacketSize - 1}; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - EIGEN_UNROLL_LOOP - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / this->m_outputStrides[i]; - const Index idx1 = indices[1] / this->m_outputStrides[i]; - inputIndices[0] += idx0 * this->m_inputStrides[i]; - inputIndices[1] += idx1 * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; - } - inputIndices[0] += indices[0] * this->m_inputStrides[0]; - inputIndices[1] += indices[1] * this->m_inputStrides[0]; - } else { // RowMajor - EIGEN_UNROLL_LOOP - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / this->m_outputStrides[i]; - const Index idx1 = indices[1] / this->m_outputStrides[i]; - inputIndices[0] += idx0 * this->m_inputStrides[i]; - inputIndices[1] += idx1 * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; - } - inputIndices[0] += indices[0] * 
this->m_inputStrides[NumDims-1]; - inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; - } - if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { - this->m_impl.template writePacket<Unaligned>(inputIndices[0], x); - } - else { - EIGEN_ALIGN_MAX Scalar values[PacketSize]; - internal::pstore<Scalar, PacketReturnType>(values, x); - this->m_impl.coeffRef(inputIndices[0]) = values[0]; - this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1]; - EIGEN_UNROLL_LOOP - for (int i = 1; i < PacketSize-1; ++i) { - this->coeffRef(index+i) = values[i]; - } - } - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h deleted file mode 100644 index 926ecdd..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h +++ /dev/null @@ -1,303 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com> -// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H -#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H - -namespace Eigen { - -/** \class TensorTrace - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor Trace class. 
- * - * - */ - -namespace internal { -template<typename Dims, typename XprType> -struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType> -{ - typedef typename XprType::Scalar Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value; - static const int Layout = XprTraits::Layout; -}; - -template<typename Dims, typename XprType> -struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense> -{ - typedef const TensorTraceOp<Dims, XprType>& type; -}; - -template<typename Dims, typename XprType> -struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type> -{ - typedef TensorTraceOp<Dims, XprType> type; -}; - -} // end namespace internal - - -template<typename Dims, typename XprType> -class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> > -{ - public: - typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested; - typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims) - : m_xpr(expr), m_dims(dims) { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Dims& dims() const { return m_dims; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Dims m_dims; -}; - - -// Eval as rvalue 
-template<typename Dims, typename ArgType, typename Device> -struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> -{ - typedef TensorTraceOp<Dims, ArgType> XprType; - static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - static const int NumReducedDims = internal::array_size<Dims>::value; - static const int NumOutputDims = NumInputDims - NumReducedDims; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumOutputDims> Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_traceDim(1), m_device(device) - { - - EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - for (int i = 0; i < NumInputDims; ++i) { - m_reduced[i] = false; - } - - const Dims& op_dims = op.dims(); - for (int i = 0; i < NumReducedDims; ++i) { - eigen_assert(op_dims[i] >= 0); - eigen_assert(op_dims[i] 
< NumInputDims); - m_reduced[op_dims[i]] = true; - } - - // All the dimensions should be distinct to compute the trace - int num_distinct_reduce_dims = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (m_reduced[i]) { - ++num_distinct_reduce_dims; - } - } - - eigen_assert(num_distinct_reduce_dims == NumReducedDims); - - // Compute the dimensions of the result. - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - - int output_index = 0; - int reduced_index = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (m_reduced[i]) { - m_reducedDims[reduced_index] = input_dims[i]; - if (reduced_index > 0) { - // All the trace dimensions must have the same size - eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]); - } - ++reduced_index; - } - else { - m_dimensions[output_index] = input_dims[i]; - ++output_index; - } - } - - if (NumReducedDims != 0) { - m_traceDim = m_reducedDims[0]; - } - - // Compute the output strides - if (NumOutputDims > 0) { - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_outputStrides[0] = 1; - for (int i = 1; i < NumOutputDims; ++i) { - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - } - } - else { - m_outputStrides.back() = 1; - for (int i = NumOutputDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - } - } - } - - // Compute the input strides - if (NumInputDims > 0) { - array<Index, NumInputDims> input_strides; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - input_strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; - } - } - else { - input_strides.back() = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; - } - } - - output_index = 0; - reduced_index = 0; - for (int i = 0; i < NumInputDims; ++i) { - if(m_reduced[i]) { - m_reducedStrides[reduced_index] = 
input_strides[i]; - ++reduced_index; - } - else { - m_preservedStrides[output_index] = input_strides[i]; - ++output_index; - } - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_dimensions; - } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - // Initialize the result - CoeffReturnType result = internal::cast<int, CoeffReturnType>(0); - Index index_stride = 0; - for (int i = 0; i < NumReducedDims; ++i) { - index_stride += m_reducedStrides[i]; - } - - // If trace is requested along all dimensions, starting index would be 0 - Index cur_index = 0; - if (NumOutputDims != 0) - cur_index = firstInput(index); - for (Index i = 0; i < m_traceDim; ++i) { - result += m_impl.coeff(cur_index); - cur_index += index_stride; - } - - return result; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index + i); - } - PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values); - return result; - } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: - // Given the output index, finds the first index in the input tensor used to compute the trace - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - Index startInput = 0; - if 
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = NumOutputDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - startInput += index * m_preservedStrides[0]; - } - else { - for (int i = 0; i < NumOutputDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - startInput += index * m_preservedStrides[NumOutputDims - 1]; - } - return startInput; - } - - Dimensions m_dimensions; - TensorEvaluator<ArgType, Device> m_impl; - // Initialize the size of the trace dimension - Index m_traceDim; - const Device EIGEN_DEVICE_REF m_device; - array<bool, NumInputDims> m_reduced; - array<Index, NumReducedDims> m_reducedDims; - array<Index, NumOutputDims> m_outputStrides; - array<Index, NumReducedDims> m_reducedStrides; - array<Index, NumOutputDims> m_preservedStrides; -}; - - -} // End namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h deleted file mode 100644 index 4f7fd34..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h +++ /dev/null @@ -1,264 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H -#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H - -namespace Eigen { -namespace internal { - - -template<typename Scalar, int Options> -class compute_tensor_flags -{ - enum { - is_dynamic_size_storage = 1, - - is_aligned = - ( - ((Options&DontAlign)==0) && ( -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - (!is_dynamic_size_storage) -#else - 0 -#endif - | -#if EIGEN_MAX_ALIGN_BYTES>0 - is_dynamic_size_storage -#else - 0 -#endif - ) - ), - packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0 - }; - - public: - enum { ret = packet_access_bit }; -}; - - -template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_> -struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > -{ - typedef Scalar_ Scalar; - typedef Dense StorageKind; - typedef IndexType_ Index; - static const int NumDimensions = NumIndices_; - static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; - enum { - Options = Options_, - Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit) - }; - template <typename T> struct MakePointer { - typedef T* Type; - }; - typedef typename MakePointer<Scalar>::Type PointerType; -}; - - -template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_> -struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> > -{ - typedef Scalar_ Scalar; - typedef Dense StorageKind; - typedef IndexType_ Index; - static const int NumDimensions = array_size<Dimensions>::value; - static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; - enum { - Options = Options_, - Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 
0: LvalueBit) - }; - template <typename T> struct MakePointer { - typedef T* Type; - }; - typedef typename MakePointer<Scalar>::Type PointerType; -}; - - -template<typename PlainObjectType, int Options_, template <class> class MakePointer_> -struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> > - : public traits<PlainObjectType> -{ - typedef traits<PlainObjectType> BaseTraits; - typedef typename BaseTraits::Scalar Scalar; - typedef typename BaseTraits::StorageKind StorageKind; - typedef typename BaseTraits::Index Index; - static const int NumDimensions = BaseTraits::NumDimensions; - static const int Layout = BaseTraits::Layout; - enum { - Options = Options_, - Flags = BaseTraits::Flags - }; - template <class T> struct MakePointer { - // Intermediate typedef to workaround MSVC issue. - typedef MakePointer_<T> MakePointerT; - typedef typename MakePointerT::Type Type; - }; - typedef typename MakePointer<Scalar>::Type PointerType; -}; - -template<typename PlainObjectType> -struct traits<TensorRef<PlainObjectType> > - : public traits<PlainObjectType> -{ - typedef traits<PlainObjectType> BaseTraits; - typedef typename BaseTraits::Scalar Scalar; - typedef typename BaseTraits::StorageKind StorageKind; - typedef typename BaseTraits::Index Index; - static const int NumDimensions = BaseTraits::NumDimensions; - static const int Layout = BaseTraits::Layout; - enum { - Options = BaseTraits::Options, - Flags = BaseTraits::Flags - }; - typedef typename BaseTraits::PointerType PointerType; -}; - - -template<typename _Scalar, int NumIndices_, int Options, typename IndexType_> -struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense> -{ - typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type; -}; - -template<typename _Scalar, int NumIndices_, int Options, typename IndexType_> -struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense> -{ - typedef const Tensor<_Scalar, NumIndices_, Options, 
IndexType_>EIGEN_DEVICE_REF type; -}; - -template<typename Scalar_, typename Dimensions, int Options, typename IndexType_> -struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense> -{ - typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type; -}; - -template<typename Scalar_, typename Dimensions, int Options, typename IndexType_> -struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense> -{ - typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type; -}; - -template<typename PlainObjectType, int Options, template <class> class MakePointer> -struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense> -{ - typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type; -}; - -template<typename PlainObjectType, int Options, template <class> class MakePointer> -struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense> -{ - typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type; -}; - -template<typename PlainObjectType> -struct eval<TensorRef<PlainObjectType>, Eigen::Dense> -{ - typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type; -}; - -template<typename PlainObjectType> -struct eval<const TensorRef<PlainObjectType>, Eigen::Dense> -{ - typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type; -}; - -// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector. 
-template<typename T, int n=1, typename PlainObject = void> struct nested -{ - typedef typename ref_selector<T>::type type; -}; - -template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_> -struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > -{ - typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type; -}; - -template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_> -struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> > -{ - typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type; -}; - -template <typename Scalar_, typename Dimensions, int Options, typename IndexType_> -struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> > -{ - typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type; -}; - -template <typename Scalar_, typename Dimensions, int Options, typename IndexType_> -struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> > -{ - typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type; -}; - - -template <typename PlainObjectType> -struct nested<TensorRef<PlainObjectType> > -{ - typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type; -}; - -template <typename PlainObjectType> -struct nested<const TensorRef<PlainObjectType> > -{ - typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type; -}; - -} // end namespace internal - -// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C, -// R, B), and convolve it with a set of filters, which can also be presented as -// a tensor (D, K, K, M), where M is the number of filters, K is the filter -// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For -// simplicity we assume that we always use square filters (which is usually the -// case in images), hence the two Ks in the tensor dimension. 
It also takes in -// a few additional parameters: -// Stride (S): The convolution stride is the offset between locations where we -// apply the filters. A larger stride means that the output will be -// spatially smaller. -// Padding (P): The padding we apply to the input tensor along the R and C -// dimensions. This is usually used to make sure that the spatial -// dimensions of the output matches our intention. -// -// Two types of padding are often used: -// SAME: The pad value is computed so that the output will have size -// R/S and C/S. -// VALID: no padding is carried out. -// When we do padding, the padded values at the padded locations are usually -// zero. -// -// The output dimensions for convolution, when given all the parameters above, -// are as follows: -// When Padding = SAME: the output size is (B, R', C', M), where -// R' = ceil(float(R) / float(S)) -// C' = ceil(float(C) / float(S)) -// where ceil is the ceiling function. The input tensor is padded with 0 as -// needed. The number of padded rows and columns are computed as: -// Pr = ((R' - 1) * S + K - R) / 2 -// Pc = ((C' - 1) * S + K - C) / 2 -// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2. -// This is where SAME comes from - the output has the same size as the input has. -// When Padding = VALID: the output size is computed as -// R' = ceil(float(R - K + 1) / float(S)) -// C' = ceil(float(C - K + 1) / float(S)) -// and the number of padded rows and columns are computed in the same way as in -// the SAME case. -// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, -// Pc=0. 
-typedef enum { - PADDING_VALID = 1, - PADDING_SAME = 2 -} PaddingType; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h deleted file mode 100644 index d23f2e4..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h +++ /dev/null @@ -1,249 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H -#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H - -namespace Eigen { -namespace internal { - - -template <uint64_t n> -struct static_val { - static const uint64_t value = n; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { } - - template <typename T> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { - EIGEN_UNUSED_VARIABLE(v); - eigen_assert(v == n); - } -}; - - -template <typename HIGH = uint64_t, typename LOW = uint64_t> -struct TensorUInt128 -{ - HIGH high; - LOW low; - - template<typename OTHER_HIGH, typename OTHER_LOW> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) { - EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); - } - - template<typename OTHER_HIGH, typename OTHER_LOW> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128& operator = (const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) { - 
EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); - high = other.high; - low = other.low; - return *this; - } - - template<typename T> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - explicit TensorUInt128(const T& x) : high(0), low(x) { - eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest())); - eigen_assert(x >= 0); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(HIGH y, LOW x) : high(y), low(x) { } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const { - return low; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const { - return low; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const { - return high; - } -}; - - -template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) -{ - return (lhs.high == rhs.high) & (lhs.low == rhs.low); -} - -template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) -{ - return (lhs.high != rhs.high) | (lhs.low != rhs.low); -} - -template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) -{ - if (lhs.high != rhs.high) { - return lhs.high > rhs.high; - } - return lhs.low >= rhs.low; -} - -template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) -{ - if (lhs.high != rhs.high) { - return lhs.high < rhs.high; - } - return lhs.low < rhs.low; -} - -template <typename HL, typename LL, typename HR, 
typename LR> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) -{ - TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low); - if (result.low < rhs.low) { - result.high += 1; - } - return result; -} - -template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) -{ - TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low); - if (result.low > lhs.low) { - result.high -= 1; - } - return result; -} - - -template <typename HL, typename LL, typename HR, typename LR> -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) -{ - // Split each 128-bit integer into 4 32-bit integers, and then do the - // multiplications by hand as follow: - // lhs a b c d - // rhs e f g h - // ----------- - // ah bh ch dh - // bg cg dg - // cf df - // de - // The result is stored in 2 64bit integers, high and low. - - const uint64_t LOW = 0x00000000FFFFFFFFLL; - const uint64_t HIGH = 0xFFFFFFFF00000000LL; - - uint64_t d = lhs.low & LOW; - uint64_t c = (lhs.low & HIGH) >> 32LL; - uint64_t b = lhs.high & LOW; - uint64_t a = (lhs.high & HIGH) >> 32LL; - - uint64_t h = rhs.low & LOW; - uint64_t g = (rhs.low & HIGH) >> 32LL; - uint64_t f = rhs.high & LOW; - uint64_t e = (rhs.high & HIGH) >> 32LL; - - // Compute the low 32 bits of low - uint64_t acc = d * h; - uint64_t low = acc & LOW; - // Compute the high 32 bits of low. 
Add a carry every time we wrap around - acc >>= 32LL; - uint64_t carry = 0; - uint64_t acc2 = acc + c * h; - if (acc2 < acc) { - carry++; - } - acc = acc2 + d * g; - if (acc < acc2) { - carry++; - } - low |= (acc << 32LL); - - // Carry forward the high bits of acc to initiate the computation of the - // low 32 bits of high - acc2 = (acc >> 32LL) | (carry << 32LL); - carry = 0; - - acc = acc2 + b * h; - if (acc < acc2) { - carry++; - } - acc2 = acc + c * g; - if (acc2 < acc) { - carry++; - } - acc = acc2 + d * f; - if (acc < acc2) { - carry++; - } - uint64_t high = acc & LOW; - - // Start to compute the high 32 bits of high. - acc2 = (acc >> 32LL) | (carry << 32LL); - - acc = acc2 + a * h; - acc2 = acc + b * g; - acc = acc2 + c * f; - acc2 = acc + d * e; - high |= (acc2 << 32LL); - - return TensorUInt128<uint64_t, uint64_t>(high, low); -} - -template <typename HL, typename LL, typename HR, typename LR> -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) -{ - if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) { - return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low); - } else if (lhs < rhs) { - return TensorUInt128<uint64_t, uint64_t>(0); - } else { - // calculate the biggest power of 2 times rhs that's less than or equal to lhs - TensorUInt128<uint64_t, uint64_t> power2(1); - TensorUInt128<uint64_t, uint64_t> d(rhs); - TensorUInt128<uint64_t, uint64_t> tmp(lhs - d); - while (lhs >= d) { - tmp = tmp - d; - d = d + d; - power2 = power2 + power2; - } - - tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low); - TensorUInt128<uint64_t, uint64_t> result(0); - while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) { - if (tmp >= d) { - tmp = tmp - d; - result = result + power2; - } - // Shift right - power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63)); - d = TensorUInt128<uint64_t, 
uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63)); - } - - return result; - } -} - - -} // namespace internal -} // namespace Eigen - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h deleted file mode 100644 index 0beb9ff..0000000 --- a/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h +++ /dev/null @@ -1,629 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H -#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H - -namespace Eigen { - -/** \class TensorVolumePatch - * \ingroup CXX11_Tensor_Module - * - * \brief Patch extraction specialized for processing of volumetric data. - * This assumes that the input has a least 4 dimensions ordered as follows: - * - channels - * - planes - * - rows - * - columns - * - (optional) additional dimensions such as time or batch size. - * Calling the volume patch code with patch_planes, patch_rows, and patch_cols - * is equivalent to calling the regular patch extraction code with parameters - * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional - * dimensions. 
- */ -namespace internal { - -template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> -struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType> -{ - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; - static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; - -}; - -template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> -struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense> -{ - typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type; -}; - -template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> -struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type> -{ - typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type; -}; - -} // end namespace internal - -template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> -class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested; - typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& 
expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, - PaddingType padding_type, Scalar padding_value) - : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), - m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), - m_padding_type(padding_type), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, - DenseIndex padding_top_z, DenseIndex padding_bottom_z, - DenseIndex padding_top, DenseIndex padding_bottom, - DenseIndex padding_left, DenseIndex padding_right, - Scalar padding_value) - : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), - m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_plane_inflate_strides(plane_inflate_strides), 
m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), - m_padding_left(padding_left), m_padding_right(padding_right), - m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC - DenseIndex patch_planes() const { return m_patch_planes; } - EIGEN_DEVICE_FUNC - DenseIndex patch_rows() const { return m_patch_rows; } - EIGEN_DEVICE_FUNC - DenseIndex patch_cols() const { return m_patch_cols; } - EIGEN_DEVICE_FUNC - DenseIndex plane_strides() const { return m_plane_strides; } - EIGEN_DEVICE_FUNC - DenseIndex row_strides() const { return m_row_strides; } - EIGEN_DEVICE_FUNC - DenseIndex col_strides() const { return m_col_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_plane_strides() const { return m_in_plane_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_row_strides() const { return m_in_row_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_col_strides() const { return m_in_col_strides; } - EIGEN_DEVICE_FUNC - DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; } - EIGEN_DEVICE_FUNC - DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } - EIGEN_DEVICE_FUNC - DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } - EIGEN_DEVICE_FUNC - bool padding_explicit() const { return m_padding_explicit; } - EIGEN_DEVICE_FUNC - DenseIndex padding_top_z() const { return m_padding_top_z; } - EIGEN_DEVICE_FUNC - DenseIndex padding_bottom_z() const { return m_padding_bottom_z; } - EIGEN_DEVICE_FUNC - DenseIndex padding_top() const { return m_padding_top; } - EIGEN_DEVICE_FUNC - DenseIndex padding_bottom() const { return m_padding_bottom; } - EIGEN_DEVICE_FUNC - DenseIndex padding_left() const { return m_padding_left; } - EIGEN_DEVICE_FUNC - DenseIndex padding_right() const { return m_padding_right; } - 
EIGEN_DEVICE_FUNC - PaddingType padding_type() const { return m_padding_type; } - EIGEN_DEVICE_FUNC - Scalar padding_value() const { return m_padding_value; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const DenseIndex m_patch_planes; - const DenseIndex m_patch_rows; - const DenseIndex m_patch_cols; - const DenseIndex m_plane_strides; - const DenseIndex m_row_strides; - const DenseIndex m_col_strides; - const DenseIndex m_in_plane_strides; - const DenseIndex m_in_row_strides; - const DenseIndex m_in_col_strides; - const DenseIndex m_plane_inflate_strides; - const DenseIndex m_row_inflate_strides; - const DenseIndex m_col_inflate_strides; - const bool m_padding_explicit; - const DenseIndex m_padding_top_z; - const DenseIndex m_padding_bottom_z; - const DenseIndex m_padding_top; - const DenseIndex m_padding_bottom; - const DenseIndex m_padding_left; - const DenseIndex m_padding_right; - const PaddingType m_padding_type; - const Scalar m_padding_value; -}; - - -// Eval as rvalue -template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device> -struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device> -{ - typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType; - typedef typename XprType::Index Index; - static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; - static const int NumDims = NumInputDims + 1; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef StorageMemory<CoeffReturnType, Device> Storage; - 
typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : - m_impl(op.expression(), device) - { - EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); - - m_paddingValue = op.padding_value(); - - const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - - // Cache a few variables. - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_inputDepth = input_dims[0]; - m_inputPlanes = input_dims[1]; - m_inputRows = input_dims[2]; - m_inputCols = input_dims[3]; - } else { - m_inputDepth = input_dims[NumInputDims-1]; - m_inputPlanes = input_dims[NumInputDims-2]; - m_inputRows = input_dims[NumInputDims-3]; - m_inputCols = input_dims[NumInputDims-4]; - } - - m_plane_strides = op.plane_strides(); - m_row_strides = op.row_strides(); - m_col_strides = op.col_strides(); - - // Input strides and effective input/patch size - m_in_plane_strides = op.in_plane_strides(); - m_in_row_strides = op.in_row_strides(); - m_in_col_strides = op.in_col_strides(); - m_plane_inflate_strides = op.plane_inflate_strides(); - m_row_inflate_strides = op.row_inflate_strides(); - m_col_inflate_strides = op.col_inflate_strides(); - - // The "effective" spatial size after inflating data with zeros. 
- m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1; - m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; - m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; - m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1); - m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); - m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); - - if (op.padding_explicit()) { - m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides)); - m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); - m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); - m_planePaddingTop = op.padding_top_z(); - m_rowPaddingTop = op.padding_top(); - m_colPaddingLeft = op.padding_left(); - } else { - // Computing padding from the type - switch (op.padding_type()) { - case PADDING_VALID: - m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides)); - m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides)); - m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides)); - m_planePaddingTop = 0; - m_rowPaddingTop = 0; - m_colPaddingLeft = 0; - break; - case PADDING_SAME: { - m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides)); - m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides)); - m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides)); - const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - 
m_input_planes_eff; - const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff; - const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff; - m_planePaddingTop = dz / 2; - m_rowPaddingTop = dy / 2; - m_colPaddingLeft = dx / 2; - break; - } - default: - eigen_assert(false && "unexpected padding"); - } - } - eigen_assert(m_outputRows > 0); - eigen_assert(m_outputCols > 0); - eigen_assert(m_outputPlanes > 0); - - // Dimensions for result of extraction. - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - // ColMajor - // 0: depth - // 1: patch_planes - // 2: patch_rows - // 3: patch_cols - // 4: number of patches - // 5 and beyond: anything else (such as batch). - m_dimensions[0] = input_dims[0]; - m_dimensions[1] = op.patch_planes(); - m_dimensions[2] = op.patch_rows(); - m_dimensions[3] = op.patch_cols(); - m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols; - for (int i = 5; i < NumDims; ++i) { - m_dimensions[i] = input_dims[i-1]; - } - } else { - // RowMajor - // NumDims-1: depth - // NumDims-2: patch_planes - // NumDims-3: patch_rows - // NumDims-4: patch_cols - // NumDims-5: number of patches - // NumDims-6 and beyond: anything else (such as batch). - m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; - m_dimensions[NumDims-2] = op.patch_planes(); - m_dimensions[NumDims-3] = op.patch_rows(); - m_dimensions[NumDims-4] = op.patch_cols(); - m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols; - for (int i = NumDims-6; i >= 0; --i) { - m_dimensions[i] = input_dims[i]; - } - } - - // Strides for the output tensor. 
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_rowStride = m_dimensions[1]; - m_colStride = m_dimensions[2] * m_rowStride; - m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0]; - m_otherStride = m_patchStride * m_dimensions[4]; - } else { - m_rowStride = m_dimensions[NumDims-2]; - m_colStride = m_dimensions[NumDims-3] * m_rowStride; - m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1]; - m_otherStride = m_patchStride * m_dimensions[NumDims-5]; - } - - // Strides for navigating through the input tensor. - m_planeInputStride = m_inputDepth; - m_rowInputStride = m_inputDepth * m_inputPlanes; - m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; - m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; - - m_outputPlanesRows = m_outputPlanes * m_outputRows; - - // Fast representations of different variables. - m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); - - m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); - m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); - m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride); - m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides); - m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides); - m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides); - m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff); - m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes); - m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows); - - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]); - } else { - m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const 
{ return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - // Patch index corresponding to the passed in index. - const Index patchIndex = index / m_fastPatchStride; - - // Spatial offset within the patch. This has to be translated into 3D - // coordinates within the patch. - const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; - - // Batch, etc. - const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride; - const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; - - // Calculate column index in the input original tensor. - const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; - const Index colOffset = patchOffset / m_fastColStride; - const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); - if (inputCol < 0 || inputCol >= m_input_cols_eff || - ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { - return Scalar(m_paddingValue); - } - - // Calculate row index in the original input tensor. - const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; - const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride; - const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? 
(inputRow / m_fastInputRowStride) : 0); - if (inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { - return Scalar(m_paddingValue); - } - - // Calculate plane index in the original input tensor. - const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); - const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; - const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; - const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); - if (inputPlane < 0 || inputPlane >= m_input_planes_eff || - ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { - return Scalar(m_paddingValue); - } - - const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; - const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; - - const Index inputIndex = depth + - origInputRow * m_rowInputStride + - origInputCol * m_colInputStride + - origInputPlane * m_planeInputStride + - otherIndex * m_otherInputStride; - - return m_impl.coeff(inputIndex); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || - m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { - return packetWithPossibleZero(index); - } - - const Index indices[2] = {index, index + PacketSize - 1}; - const Index patchIndex = indices[0] / m_fastPatchStride; - if (patchIndex != indices[1] / m_fastPatchStride) { - return packetWithPossibleZero(index); - } - const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride; - eigen_assert(otherIndex == indices[1] / m_fastOtherStride); - - // Find the offset of the element wrt the location of the first element. - const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, - (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; - - const Index patch3DIndex = (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; - eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); - - const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; - const Index colOffsets[2] = { - patchOffsets[0] / m_fastColStride, - patchOffsets[1] / m_fastColStride}; - - // Calculate col indices in the original input tensor. 
- const Index inputCols[2] = { - colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, - colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; - if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { - return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); - } - - if (inputCols[0] != inputCols[1]) { - return packetWithPossibleZero(index); - } - - const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; - const Index rowOffsets[2] = { - (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, - (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; - eigen_assert(rowOffsets[0] <= rowOffsets[1]); - // Calculate col indices in the original input tensor. - const Index inputRows[2] = { - rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, - rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; - - if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { - return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); - } - - if (inputRows[0] != inputRows[1]) { - return packetWithPossibleZero(index); - } - - const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); - const Index planeOffsets[2] = { - patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride, - patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride}; - eigen_assert(planeOffsets[0] <= planeOffsets[1]); - const Index inputPlanes[2] = { - planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop, - planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop}; - - if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) { - return internal::pset1<PacketReturnType>(Scalar(m_paddingValue)); - } - - if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { - // no padding - const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : NumDims - 1; - const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; - const Index inputIndex = depth + - inputRows[0] * m_rowInputStride + - inputCols[0] * m_colInputStride + - m_planeInputStride * inputPlanes[0] + - otherIndex * m_otherInputStride; - return m_impl.template packet<Unaligned>(inputIndex); - } - - return packetWithPossibleZero(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double compute_cost = - 10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() + - 8 * TensorOpCost::AddCost<Index>(); - return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return 
m_in_col_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const - { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; - } - - Dimensions m_dimensions; - - // Parameters passed to the constructor. - Index m_plane_strides; - Index m_row_strides; - Index m_col_strides; - - Index m_outputPlanes; - Index m_outputRows; - Index m_outputCols; - - Index m_planePaddingTop; - Index m_rowPaddingTop; - Index m_colPaddingLeft; - - Index m_in_plane_strides; - Index m_in_row_strides; - Index m_in_col_strides; - - Index m_plane_inflate_strides; - Index m_row_inflate_strides; - Index m_col_inflate_strides; - - // Cached input size. - Index m_inputDepth; - Index m_inputPlanes; - Index m_inputRows; - Index m_inputCols; - - // Other cached variables. - Index m_outputPlanesRows; - - // Effective input/patch post-inflation size. - Index m_input_planes_eff; - Index m_input_rows_eff; - Index m_input_cols_eff; - Index m_patch_planes_eff; - Index m_patch_rows_eff; - Index m_patch_cols_eff; - - // Strides for the output tensor. - Index m_otherStride; - Index m_patchStride; - Index m_rowStride; - Index m_colStride; - - // Strides for the input tensor. 
- Index m_planeInputStride; - Index m_rowInputStride; - Index m_colInputStride; - Index m_otherInputStride; - - internal::TensorIntDivisor<Index> m_fastOtherStride; - internal::TensorIntDivisor<Index> m_fastPatchStride; - internal::TensorIntDivisor<Index> m_fastColStride; - internal::TensorIntDivisor<Index> m_fastRowStride; - internal::TensorIntDivisor<Index> m_fastInputPlaneStride; - internal::TensorIntDivisor<Index> m_fastInputRowStride; - internal::TensorIntDivisor<Index> m_fastInputColStride; - internal::TensorIntDivisor<Index> m_fastInputColsEff; - internal::TensorIntDivisor<Index> m_fastOutputPlanesRows; - internal::TensorIntDivisor<Index> m_fastOutputPlanes; - internal::TensorIntDivisor<Index> m_fastOutputDepth; - - Scalar m_paddingValue; - - TensorEvaluator<ArgType, Device> m_impl; - - -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h deleted file mode 100644 index bc4f202..0000000 --- a/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ /dev/null @@ -1,293 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H -#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H - -namespace Eigen { - -class DynamicSGroup -{ - public: - inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); } - inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { } - inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); } - inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } - inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } - - void add(int one, int two, int flags = 0); - - template<typename Gen_> - inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); } - inline void addSymmetry(int one, int two) { add(one, two, 0); } - inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); } - inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); } - inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); } - - template<typename Op, typename RV, typename Index, std::size_t N, typename... Args> - inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... 
args) const - { - eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); - for (std::size_t i = 0; i < size(); i++) - initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...); - return initial; - } - - template<typename Op, typename RV, typename Index, typename... Args> - inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const - { - eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); - for (std::size_t i = 0; i < size(); i++) - initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...); - return initial; - } - - inline int globalFlags() const { return m_globalFlags; } - inline std::size_t size() const { return m_elements.size(); } - - template<typename Tensor_, typename... IndexTypes> - inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const - { - static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}}); - } - - template<typename Tensor_> - inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const - { - return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices); - } - private: - struct GroupElement { - std::vector<int> representation; - int flags; - bool isId() const - { - for (std::size_t i = 0; i < representation.size(); i++) - if (i != (size_t)representation[i]) - return false; - return true; - } - }; - struct Generator { - int one; - int two; - int flags; - constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} - }; - - std::size_t m_numIndices; - std::vector<GroupElement> m_elements; - std::vector<Generator> m_generators; - int m_globalFlags; - - template<typename Index, std::size_t N, int... n> - inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const - { - return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... 
}}; - } - - template<typename Index> - inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const - { - std::vector<Index> result; - result.reserve(idx.size()); - for (auto k : m_elements[which].representation) - result.push_back(idx[k]); - for (std::size_t i = m_numIndices; i < idx.size(); i++) - result.push_back(idx[i]); - return result; - } - - inline GroupElement ge(Generator const& g) const - { - GroupElement result; - result.representation.reserve(m_numIndices); - result.flags = g.flags; - for (std::size_t k = 0; k < m_numIndices; k++) { - if (k == (std::size_t)g.one) - result.representation.push_back(g.two); - else if (k == (std::size_t)g.two) - result.representation.push_back(g.one); - else - result.representation.push_back(int(k)); - } - return result; - } - - GroupElement mul(GroupElement, GroupElement) const; - inline GroupElement mul(Generator g1, GroupElement g2) const - { - return mul(ge(g1), g2); - } - - inline GroupElement mul(GroupElement g1, Generator g2) const - { - return mul(g1, ge(g2)); - } - - inline GroupElement mul(Generator g1, Generator g2) const - { - return mul(ge(g1), ge(g2)); - } - - inline int findElement(GroupElement e) const - { - for (auto ee : m_elements) { - if (ee.representation == e.representation) - return ee.flags ^ e.flags; - } - return -1; - } - - void updateGlobalFlags(int flagDiffOfSameGenerator); -}; - -// dynamic symmetry group that auto-adds the template parameters in the constructor -template<typename... 
Gen> -class DynamicSGroupFromTemplateArgs : public DynamicSGroup -{ - public: - inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() - { - add_all(internal::type_list<Gen...>()); - } - inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { } - inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { } - inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) { DynamicSGroup::operator=(o); return *this; } - inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) { DynamicSGroup::operator=(o); return *this; } - - private: - template<typename Gen1, typename... GenNext> - inline void add_all(internal::type_list<Gen1, GenNext...>) - { - add(Gen1()); - add_all(internal::type_list<GenNext...>()); - } - - inline void add_all(internal::type_list<>) - { - } -}; - -inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const -{ - eigen_internal_assert(g1.representation.size() == m_numIndices); - eigen_internal_assert(g2.representation.size() == m_numIndices); - - GroupElement result; - result.representation.reserve(m_numIndices); - for (std::size_t i = 0; i < m_numIndices; i++) { - int v = g2.representation[g1.representation[i]]; - eigen_assert(v >= 0); - result.representation.push_back(v); - } - result.flags = g1.flags ^ g2.flags; - return result; -} - -inline void DynamicSGroup::add(int one, int two, int flags) -{ - eigen_assert(one >= 0); - eigen_assert(two >= 0); - eigen_assert(one != two); - - if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { - std::size_t newNumIndices = (one > two) ? 
one : two + 1; - for (auto& gelem : m_elements) { - gelem.representation.reserve(newNumIndices); - for (std::size_t i = m_numIndices; i < newNumIndices; i++) - gelem.representation.push_back(i); - } - m_numIndices = newNumIndices; - } - - Generator g{one, two, flags}; - GroupElement e = ge(g); - - /* special case for first generator */ - if (m_elements.size() == 1) { - while (!e.isId()) { - m_elements.push_back(e); - e = mul(e, g); - } - - if (e.flags > 0) - updateGlobalFlags(e.flags); - - // only add in case we didn't have identity - if (m_elements.size() > 1) - m_generators.push_back(g); - return; - } - - int p = findElement(e); - if (p >= 0) { - updateGlobalFlags(p); - return; - } - - std::size_t coset_order = m_elements.size(); - m_elements.push_back(e); - for (std::size_t i = 1; i < coset_order; i++) - m_elements.push_back(mul(m_elements[i], e)); - m_generators.push_back(g); - - std::size_t coset_rep = coset_order; - do { - for (auto g : m_generators) { - e = mul(m_elements[coset_rep], g); - p = findElement(e); - if (p < 0) { - // element not yet in group - m_elements.push_back(e); - for (std::size_t i = 1; i < coset_order; i++) - m_elements.push_back(mul(m_elements[i], e)); - } else if (p > 0) { - updateGlobalFlags(p); - } - } - coset_rep += coset_order; - } while (coset_rep < m_elements.size()); -} - -inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator) -{ - switch (flagDiffOfSameGenerator) { - case 0: - default: - // nothing happened - break; - case NegationFlag: - // every element is it's own negative => whole tensor is zero - m_globalFlags |= GlobalZeroFlag; - break; - case ConjugationFlag: - // every element is it's own conjugate => whole tensor is real - m_globalFlags |= GlobalRealFlag; - break; - case (NegationFlag | ConjugationFlag): - // every element is it's own negative conjugate => whole tensor is imaginary - m_globalFlags |= GlobalImagFlag; - break; - /* NOTE: - * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if 
one generator - * causes the tensor to be real and the next one to be imaginary, this will - * trivially give the correct result - */ - } -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h deleted file mode 100644 index 942293b..0000000 --- a/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ /dev/null @@ -1,236 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H -#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H - -namespace Eigen { - -namespace internal { - -template<typename list> struct tensor_static_symgroup_permutate; - -template<int... 
nn> -struct tensor_static_symgroup_permutate<numeric_list<int, nn...>> -{ - constexpr static std::size_t N = sizeof...(nn); - - template<typename T> - constexpr static inline std::array<T, N> run(const std::array<T, N>& indices) - { - return {{indices[nn]...}}; - } -}; - -template<typename indices_, int flags_> -struct tensor_static_symgroup_element -{ - typedef indices_ indices; - constexpr static int flags = flags_; -}; - -template<typename Gen, int N> -struct tensor_static_symgroup_element_ctor -{ - typedef tensor_static_symgroup_element< - typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type, - Gen::Flags - > type; -}; - -template<int N> -struct tensor_static_symgroup_identity_ctor -{ - typedef tensor_static_symgroup_element< - typename gen_numeric_list<int, N>::type, - 0 - > type; -}; - -template<typename iib> -struct tensor_static_symgroup_multiply_helper -{ - template<int... iia> - constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) { - return numeric_list<int, get<iia, iib>::value...>(); - } -}; - -template<typename A, typename B> -struct tensor_static_symgroup_multiply -{ - private: - typedef typename A::indices iia; - typedef typename B::indices iib; - constexpr static int ffa = A::flags; - constexpr static int ffb = B::flags; - - public: - static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices."); - - typedef tensor_static_symgroup_element< - decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())), - ffa ^ ffb - > type; -}; - -template<typename A, typename B> -struct tensor_static_symgroup_equality -{ - typedef typename A::indices iia; - typedef typename B::indices iib; - constexpr static int ffa = A::flags; - constexpr static int ffb = B::flags; - static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices."); - - constexpr static bool value = is_same<iia, iib>::value; - - 
private: - /* this should be zero if they are identical, or else the tensor - * will be forced to be pure real, pure imaginary or even pure zero - */ - constexpr static int flags_cmp_ = ffa ^ ffb; - - /* either they are not equal, then we don't care whether the flags - * match, or they are equal, and then we have to check - */ - constexpr static bool is_zero = value && flags_cmp_ == NegationFlag; - constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag; - constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag); - - public: - constexpr static int global_flags = - (is_real ? GlobalRealFlag : 0) | - (is_imag ? GlobalImagFlag : 0) | - (is_zero ? GlobalZeroFlag : 0); -}; - -template<std::size_t NumIndices, typename... Gen> -struct tensor_static_symgroup -{ - typedef StaticSGroup<Gen...> type; - constexpr static std::size_t size = type::static_size; -}; - -template<typename Index, std::size_t N, int... ii, int... jj> -constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, internal::numeric_list<int, ii...>, internal::numeric_list<int, jj...>) -{ - return {{ idx[ii]..., idx[jj]... }}; -} - -template<typename Index, int... ii> -static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx, internal::numeric_list<int, ii...>) -{ - std::vector<Index> result{{ idx[ii]... }}; - std::size_t target_size = idx.size(); - for (std::size_t i = result.size(); i < target_size; i++) - result.push_back(idx[i]); - return result; -} - -template<typename T> struct tensor_static_symgroup_do_apply; - -template<typename first, typename... next> -struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>> -{ - template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args> - static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... 
args) - { - static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices."); - typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices; - initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward<Args>(args)...); - return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...); - } - - template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args> - static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args) - { - eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); - initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward<Args>(args)...); - return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...); - } -}; - -template<EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)> -struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>> -{ - template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args> - static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...) - { - // do nothing - return initial; - } - - template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args> - static inline RV run(const std::vector<Index>&, RV initial, Args&&...) - { - // do nothing - return initial; - } -}; - -} // end namespace internal - -template<typename... 
Gen> -class StaticSGroup -{ - constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value; - typedef internal::group_theory::enumerate_group_elements< - internal::tensor_static_symgroup_multiply, - internal::tensor_static_symgroup_equality, - typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type, - internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...> - > group_elements; - typedef typename group_elements::type ge; - public: - constexpr inline StaticSGroup() {} - constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {} - constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {} - - template<typename Op, typename RV, typename Index, std::size_t N, typename... Args> - static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) - { - return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...); - } - - template<typename Op, typename RV, typename Index, typename... Args> - static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) - { - eigen_assert(idx.size() == NumIndices); - return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...); - } - - constexpr static std::size_t static_size = ge::count; - - constexpr static inline std::size_t size() { - return ge::count; - } - constexpr static inline int globalFlags() { return group_elements::global_flags; } - - template<typename Tensor_, typename... IndexTypes> - inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const - { - static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}}); - } - - template<typename Tensor_> - inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const - { - return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h deleted file mode 100644 index 879d6cd..0000000 --- a/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h +++ /dev/null @@ -1,338 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H -#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H - -namespace Eigen { - -enum { - NegationFlag = 0x01, - ConjugationFlag = 0x02 -}; - -enum { - GlobalRealFlag = 0x01, - GlobalImagFlag = 0x02, - GlobalZeroFlag = 0x03 -}; - -namespace internal { - -template<std::size_t NumIndices, typename... Sym> struct tensor_symmetry_pre_analysis; -template<std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup; -template<bool instantiate, std::size_t NumIndices, typename... 
Sym> struct tensor_static_symgroup_if; -template<typename Tensor_> struct tensor_symmetry_calculate_flags; -template<typename Tensor_> struct tensor_symmetry_assign_value; -template<typename... Sym> struct tensor_symmetry_num_indices; - -} // end namespace internal - -template<int One_, int Two_> -struct Symmetry -{ - static_assert(One_ != Two_, "Symmetries must cover distinct indices."); - constexpr static int One = One_; - constexpr static int Two = Two_; - constexpr static int Flags = 0; -}; - -template<int One_, int Two_> -struct AntiSymmetry -{ - static_assert(One_ != Two_, "Symmetries must cover distinct indices."); - constexpr static int One = One_; - constexpr static int Two = Two_; - constexpr static int Flags = NegationFlag; -}; - -template<int One_, int Two_> -struct Hermiticity -{ - static_assert(One_ != Two_, "Symmetries must cover distinct indices."); - constexpr static int One = One_; - constexpr static int Two = Two_; - constexpr static int Flags = ConjugationFlag; -}; - -template<int One_, int Two_> -struct AntiHermiticity -{ - static_assert(One_ != Two_, "Symmetries must cover distinct indices."); - constexpr static int One = One_; - constexpr static int Two = Two_; - constexpr static int Flags = ConjugationFlag | NegationFlag; -}; - -/** \class DynamicSGroup - * \ingroup TensorSymmetry_Module - * - * \brief Dynamic symmetry group - * - * The %DynamicSGroup class represents a symmetry group that need not be known at - * compile time. It is useful if one wants to support arbitrary run-time defineable - * symmetries for tensors, but it is also instantiated if a symmetry group is defined - * at compile time that would be either too large for the compiler to reasonably - * generate (using templates to calculate this at compile time is very inefficient) - * or that the compiler could generate the group but that it wouldn't make sense to - * unroll the loop for setting coefficients anymore. 
- */ -class DynamicSGroup; - -/** \internal - * - * \class DynamicSGroupFromTemplateArgs - * \ingroup TensorSymmetry_Module - * - * \brief Dynamic symmetry group, initialized from template arguments - * - * This class is a child class of DynamicSGroup. It uses the template arguments - * specified to initialize itself. - */ -template<typename... Gen> -class DynamicSGroupFromTemplateArgs; - -/** \class StaticSGroup - * \ingroup TensorSymmetry_Module - * - * \brief Static symmetry group - * - * This class represents a symmetry group that is known and resolved completely - * at compile time. Ideally, no run-time penalty is incurred compared to the - * manual unrolling of the symmetry. - * - * <b><i>CAUTION:</i></b> - * - * Do not use this class directly for large symmetry groups. The compiler - * may run into a limit, or segfault or in the very least will take a very, - * very, very long time to compile the code. Use the SGroup class instead - * if you want a static group. That class contains logic that will - * automatically select the DynamicSGroup class instead if the symmetry - * group becomes too large. (In that case, unrolling may not even be - * beneficial.) - */ -template<typename... Gen> -class StaticSGroup; - -/** \class SGroup - * \ingroup TensorSymmetry_Module - * - * \brief Symmetry group, initialized from template arguments - * - * This class represents a symmetry group whose generators are already - * known at compile time. It may or may not be resolved at compile time, - * depending on the estimated size of the group. - * - * \sa StaticSGroup - * \sa DynamicSGroup - */ -template<typename... 
Gen> -class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value, Gen...>::root_type -{ - public: - constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value; - typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base; - - // make standard constructors + assignment operators public - inline SGroup() : Base() { } - inline SGroup(const SGroup<Gen...>& other) : Base(other) { } - inline SGroup(SGroup<Gen...>&& other) : Base(other) { } - inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) { Base::operator=(other); return *this; } - inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) { Base::operator=(other); return *this; } - - // all else is defined in the base class -}; - -namespace internal { - -template<typename... Sym> struct tensor_symmetry_num_indices -{ - constexpr static std::size_t value = 1; -}; - -template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> -{ -private: - constexpr static std::size_t One = static_cast<std::size_t>(One_); - constexpr static std::size_t Two = static_cast<std::size_t>(Two_); - constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value; - - // don't use std::max, since it's not constexpr until C++14... - constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1; -public: - constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three; -}; - -template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...> - : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; -template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...> - : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; -template<int One_, int Two_, typename... 
Sym> struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...> - : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {}; - -/** \internal - * - * \class tensor_symmetry_pre_analysis - * \ingroup TensorSymmetry_Module - * - * \brief Pre-select whether to use a static or dynamic symmetry group - * - * When a symmetry group could in principle be determined at compile time, - * this template implements the logic whether to actually do that or whether - * to rather defer that to runtime. - * - * The logic is as follows: - * <dl> - * <dt><b>No generators (trivial symmetry):</b></dt> - * <dd>Use a trivial static group. Ideally, this has no performance impact - * compared to not using symmetry at all. In practice, this might not - * be the case.</dd> - * <dt><b>More than 4 generators:</b></dt> - * <dd>Calculate the group at run time, it is likely far too large for the - * compiler to be able to properly generate it in a realistic time.</dd> - * <dt><b>Up to and including 4 generators:</b></dt> - * <dd>Actually enumerate all group elements, but then check how many there - * are. If there are more than 16, it is unlikely that unrolling the - * loop (as is done in the static compile-time case) is sensible, so - * use a dynamic group instead. If there are at most 16 elements, actually - * use that static group. 
Note that the largest group with 4 generators - * still compiles with reasonable resources.</dd> - * </dl> - * - * Note: Example compile time performance with g++-4.6 on an Intenl Core i5-3470 - * with 16 GiB RAM (all generators non-redundant and the subgroups don't - * factorize): - * - * # Generators -O0 -ggdb -O2 - * ------------------------------------------------------------------- - * 1 0.5 s / 250 MiB 0.45s / 230 MiB - * 2 0.5 s / 260 MiB 0.5 s / 250 MiB - * 3 0.65s / 310 MiB 0.62s / 310 MiB - * 4 2.2 s / 860 MiB 1.7 s / 770 MiB - * 5 130 s / 13000 MiB 120 s / 11000 MiB - * - * It is clear that everything is still very efficient up to 4 generators, then - * the memory and CPU requirements become unreasonable. Thus we only instantiate - * the template group theory logic if the number of generators supplied is 4 or - * lower, otherwise this will be forced to be done during runtime, where the - * algorithm is reasonably fast. - */ -template<std::size_t NumIndices> -struct tensor_symmetry_pre_analysis<NumIndices> -{ - typedef StaticSGroup<> root_type; -}; - -template<std::size_t NumIndices, typename Gen_, typename... Gens_> -struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...> -{ - constexpr static std::size_t max_static_generators = 4; - constexpr static std::size_t max_static_elements = 16; - typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper; - constexpr static std::size_t possible_size = helper::size; - - typedef typename conditional< - possible_size == 0 || possible_size >= max_static_elements, - DynamicSGroupFromTemplateArgs<Gen_, Gens_...>, - typename helper::type - >::type root_type; -}; - -template<bool instantiate, std::size_t NumIndices, typename... Gens> -struct tensor_static_symgroup_if -{ - constexpr static std::size_t size = 0; - typedef void type; -}; - -template<std::size_t NumIndices, typename... 
Gens> -struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {}; - -template<typename Tensor_> -struct tensor_symmetry_assign_value -{ - typedef typename Tensor_::Index Index; - typedef typename Tensor_::Scalar Scalar; - constexpr static std::size_t NumIndices = Tensor_::NumIndices; - - static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_) - { - Scalar value(value_); - if (transformation_flags & ConjugationFlag) - value = numext::conj(value); - if (transformation_flags & NegationFlag) - value = -value; - tensor.coeffRef(transformed_indices) = value; - return dummy; - } -}; - -template<typename Tensor_> -struct tensor_symmetry_calculate_flags -{ - typedef typename Tensor_::Index Index; - constexpr static std::size_t NumIndices = Tensor_::NumIndices; - - static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags, int current_flags, const std::array<Index, NumIndices>& orig_indices) - { - if (transformed_indices == orig_indices) { - if (transform_flags & (ConjugationFlag | NegationFlag)) - return current_flags | GlobalImagFlag; // anti-hermitian diagonal - else if (transform_flags & ConjugationFlag) - return current_flags | GlobalRealFlag; // hermitian diagonal - else if (transform_flags & NegationFlag) - return current_flags | GlobalZeroFlag; // anti-symmetric diagonal - } - return current_flags; - } -}; - -template<typename Tensor_, typename Symmetry_, int Flags = 0> -class tensor_symmetry_value_setter -{ - public: - typedef typename Tensor_::Index Index; - typedef typename Tensor_::Scalar Scalar; - constexpr static std::size_t NumIndices = Tensor_::NumIndices; - - inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array<Index, NumIndices> const& indices) - : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { } - - inline 
tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value) - { - doAssign(value); - return *this; - } - private: - Tensor_& m_tensor; - Symmetry_ m_symmetry; - std::array<Index, NumIndices> m_indices; - - inline void doAssign(Scalar const& value) - { - #ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES - int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(m_indices, m_symmetry.globalFlags(), m_indices); - if (value_flags & GlobalRealFlag) - eigen_assert(numext::imag(value) == 0); - if (value_flags & GlobalImagFlag) - eigen_assert(numext::real(value) == 0); - #endif - m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h deleted file mode 100644 index 54bf9db..0000000 --- a/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h +++ /dev/null @@ -1,669 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H -#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H - -namespace Eigen { - -namespace internal { - -namespace group_theory { - -/** \internal - * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h - * This file contains C++ templates that implement group theory algorithms. 
- * - * The algorithms allow for a compile-time analysis of finite groups. - * - * Currently only Dimino's algorithm is implemented, which returns a list - * of all elements in a group given a set of (possibly redundant) generators. - * (One could also do that with the so-called orbital algorithm, but that - * is much more expensive and usually has no advantages.) - */ - -/********************************************************************** - * "Ok kid, here is where it gets complicated." - * - Amelia Pond in the "Doctor Who" episode - * "The Big Bang" - * - * Dimino's algorithm - * ================== - * - * The following is Dimino's algorithm in sequential form: - * - * Input: identity element, list of generators, equality check, - * multiplication operation - * Output: list of group elements - * - * 1. add identity element - * 2. remove identities from list of generators - * 3. add all powers of first generator that aren't the - * identity element - * 4. go through all remaining generators: - * a. if generator is already in the list of elements - * -> do nothing - * b. otherwise - * i. remember current # of elements - * (i.e. the size of the current subgroup) - * ii. add all current elements (which includes - * the identity) each multiplied from right - * with the current generator to the group - * iii. add all remaining cosets that are generated - * by products of the new generator with itself - * and all other generators seen so far - * - * In functional form, this is implemented as a long set of recursive - * templates that have a complicated relationship. - * - * The main interface for Dimino's algorithm is the template - * enumerate_group_elements. All lists are implemented as variadic - * type_list<typename...> and numeric_list<typename = int, int...> - * templates. - * - * 'Calling' templates is usually done via typedefs. - * - * This algorithm is an extended version of the basic version. 
The - * extension consists in the fact that each group element has a set - * of flags associated with it. Multiplication of two group elements - * with each other results in a group element whose flags are the - * XOR of the flags of the previous elements. Each time the algorithm - * notices that a group element it just calculated is already in the - * list of current elements, the flags of both will be compared and - * added to the so-called 'global flags' of the group. - * - * The rationale behind this extension is that this allows not only - * for the description of symmetries between tensor indices, but - * also allows for the description of hermiticity, antisymmetry and - * antihermiticity. Negation and conjugation each are specific bit - * in the flags value and if two different ways to reach a group - * element lead to two different flags, this poses a constraint on - * the allowed values of the resulting tensor. For example, if a - * group element is reach both with and without the conjugation - * flags, it is clear that the resulting tensor has to be real. - * - * Note that this flag mechanism is quite generic and may have other - * uses beyond tensor properties. - * - * IMPORTANT: - * This algorithm assumes the group to be finite. If you try to - * run it with a group that's infinite, the algorithm will only - * terminate once you hit a compiler limit (max template depth). - * Also note that trying to use this implementation to create a - * very large group will probably either make you hit the same - * limit, cause the compiler to segfault or at the very least - * take a *really* long time (hours, days, weeks - sic!) to - * compile. It is not recommended to plug in more than 4 - * generators, unless they are independent of each other. 
- */ - -/** \internal - * - * \class strip_identities - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Cleanse a list of group elements of the identity element - * - * This template is used to make a first pass through all initial - * generators of Dimino's algorithm and remove the identity - * elements. - * - * \sa enumerate_group_elements - */ -template<template<typename, typename> class Equality, typename id, typename L> struct strip_identities; - -template< - template<typename, typename> class Equality, - typename id, - typename t, - typename... ts -> -struct strip_identities<Equality, id, type_list<t, ts...>> -{ - typedef typename conditional< - Equality<id, t>::value, - typename strip_identities<Equality, id, type_list<ts...>>::type, - typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type - >::type type; - constexpr static int global_flags = Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags; -}; - -template< - template<typename, typename> class Equality, - typename id - EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts) -> -struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>> -{ - typedef type_list<> type; - constexpr static int global_flags = 0; -}; - -/** \internal - * - * \class dimino_first_step_elements_helper - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Recursive template that adds powers of the first generator to the list of group elements - * - * This template calls itself recursively to add powers of the first - * generator to the list of group elements. It stops if it reaches - * the identity element again. 
- * - * \sa enumerate_group_elements, dimino_first_step_elements - */ -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename g, - typename current_element, - typename elements, - bool dont_add_current_element // = false -> -struct dimino_first_step_elements_helper -#ifndef EIGEN_PARSED_BY_DOXYGEN - : // recursive inheritance is too difficult for Doxygen - public dimino_first_step_elements_helper< - Multiply, - Equality, - id, - g, - typename Multiply<current_element, g>::type, - typename concat<elements, type_list<current_element>>::type, - Equality<typename Multiply<current_element, g>::type, id>::value - > {}; - -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename g, - typename current_element, - typename elements -> -struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true> -#endif // EIGEN_PARSED_BY_DOXYGEN -{ - typedef elements type; - constexpr static int global_flags = Equality<current_element, id>::global_flags; -}; - -/** \internal - * - * \class dimino_first_step_elements - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Add all powers of the first generator to the list of group elements - * - * This template takes the first non-identity generator and generates the initial - * list of elements which consists of all powers of that generator. For a group - * with just one generated, it would be enumerated after this. 
- * - * \sa enumerate_group_elements - */ -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename generators -> -struct dimino_first_step_elements -{ - typedef typename get<0, generators>::type first_generator; - typedef typename skip<1, generators>::type next_generators; - typedef type_list<first_generator> generators_done; - - typedef dimino_first_step_elements_helper< - Multiply, - Equality, - id, - first_generator, - first_generator, - type_list<id>, - false - > helper; - typedef typename helper::type type; - constexpr static int global_flags = helper::global_flags; -}; - -/** \internal - * - * \class dimino_get_coset_elements - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Generate all elements of a specific coset - * - * This template generates all the elements of a specific coset by - * multiplying all elements in the given subgroup with the new - * coset representative. Note that the first element of the - * subgroup is always the identity element, so the first element of - * the result of this template is going to be the coset - * representative itself. - * - * Note that this template accepts an additional boolean parameter - * that specifies whether to actually generate the coset (true) or - * just return an empty list (false). 
- * - * \sa enumerate_group_elements, dimino_add_cosets_for_rep - */ -template< - template<typename, typename> class Multiply, - typename sub_group_elements, - typename new_coset_rep, - bool generate_coset // = true -> -struct dimino_get_coset_elements -{ - typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type; -}; - -template< - template<typename, typename> class Multiply, - typename sub_group_elements, - typename new_coset_rep -> -struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false> -{ - typedef type_list<> type; -}; - -/** \internal - * - * \class dimino_add_cosets_for_rep - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Recursive template for adding coset spaces - * - * This template multiplies the coset representative with a generator - * from the list of previous generators. If the new element is not in - * the group already, it adds the corresponding coset. Finally it - * proceeds to call itself with the next generator from the list. - * - * \sa enumerate_group_elements, dimino_add_all_coset_spaces - */ -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename sub_group_elements, - typename elements, - typename generators, - typename rep_element, - int sub_group_size -> -struct dimino_add_cosets_for_rep; - -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename sub_group_elements, - typename elements, - typename g, - typename... 
gs, - typename rep_element, - int sub_group_size -> -struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element, sub_group_size> -{ - typedef typename Multiply<rep_element, g>::type new_coset_rep; - typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil; - constexpr static bool add_coset = !_cil::value; - - typedef typename dimino_get_coset_elements< - Multiply, - sub_group_elements, - new_coset_rep, - add_coset - >::type coset_elements; - - typedef dimino_add_cosets_for_rep< - Multiply, - Equality, - id, - sub_group_elements, - typename concat<elements, coset_elements>::type, - type_list<gs...>, - rep_element, - sub_group_size - > _helper; - - typedef typename _helper::type type; - constexpr static int global_flags = _cil::global_flags | _helper::global_flags; - - /* Note that we don't have to update global flags here, since - * we will only add these elements if they are not part of - * the group already. But that only happens if the coset rep - * is not already in the group, so the check for the coset rep - * will catch this. 
- */ -}; - -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename sub_group_elements, - typename elements - EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), - typename rep_element, - int sub_group_size -> -struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size> -{ - typedef elements type; - constexpr static int global_flags = 0; -}; - -/** \internal - * - * \class dimino_add_all_coset_spaces - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Recursive template for adding all coset spaces for a new generator - * - * This template tries to go through the list of generators (with - * the help of the dimino_add_cosets_for_rep template) as long as - * it still finds elements that are not part of the group and add - * the corresponding cosets. - * - * \sa enumerate_group_elements, dimino_add_cosets_for_rep - */ -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename sub_group_elements, - typename elements, - typename generators, - int sub_group_size, - int rep_pos, - bool stop_condition // = false -> -struct dimino_add_all_coset_spaces -{ - typedef typename get<rep_pos, elements>::type rep_element; - typedef dimino_add_cosets_for_rep< - Multiply, - Equality, - id, - sub_group_elements, - elements, - generators, - rep_element, - sub_group_elements::count - > _ac4r; - typedef typename _ac4r::type new_elements; - - constexpr static int new_rep_pos = rep_pos + sub_group_elements::count; - constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count; - - typedef dimino_add_all_coset_spaces< - Multiply, - Equality, - id, - sub_group_elements, - new_elements, - generators, - sub_group_size, - new_rep_pos, - new_stop_condition - > _helper; - - typedef typename _helper::type type; - constexpr static int 
global_flags = _helper::global_flags | _ac4r::global_flags; -}; - -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename sub_group_elements, - typename elements, - typename generators, - int sub_group_size, - int rep_pos -> -struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size, rep_pos, true> -{ - typedef elements type; - constexpr static int global_flags = 0; -}; - -/** \internal - * - * \class dimino_add_generator - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Enlarge the group by adding a new generator. - * - * It accepts a boolean parameter that determines if the generator is redundant, - * i.e. was already seen in the group. In that case, it reduces to a no-op. - * - * \sa enumerate_group_elements, dimino_add_all_coset_spaces - */ -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename elements, - typename generators_done, - typename current_generator, - bool redundant // = false -> -struct dimino_add_generator -{ - /* this template is only called if the generator is not redundant - * => all elements of the group multiplied with the new generator - * are going to be new elements of the most trivial coset space - */ - typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements; - typedef typename concat<elements, multiplied_elements>::type new_elements; - - constexpr static int rep_pos = elements::count; - - typedef dimino_add_all_coset_spaces< - Multiply, - Equality, - id, - elements, // elements of previous subgroup - new_elements, - typename concat<generators_done, type_list<current_generator>>::type, - elements::count, // size of previous subgroup - rep_pos, - false // don't stop (because rep_pos >= new_elements::count is always false at this point) - > _helper; - typedef typename _helper::type 
type; - constexpr static int global_flags = _helper::global_flags; -}; - -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename elements, - typename generators_done, - typename current_generator -> -struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true> -{ - // redundant case - typedef elements type; - constexpr static int global_flags = 0; -}; - -/** \internal - * - * \class dimino_add_remaining_generators - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Recursive template that adds all remaining generators to a group - * - * Loop through the list of generators that remain and successively - * add them to the group. - * - * \sa enumerate_group_elements, dimino_add_generator - */ -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename generators_done, - typename remaining_generators, - typename elements -> -struct dimino_add_remaining_generators -{ - typedef typename get<0, remaining_generators>::type first_generator; - typedef typename skip<1, remaining_generators>::type next_generators; - - typedef contained_in_list_gf<Equality, first_generator, elements> _cil; - - typedef dimino_add_generator< - Multiply, - Equality, - id, - elements, - generators_done, - first_generator, - _cil::value - > _helper; - - typedef typename _helper::type new_elements; - - typedef dimino_add_remaining_generators< - Multiply, - Equality, - id, - typename concat<generators_done, type_list<first_generator>>::type, - next_generators, - new_elements - > _next_iter; - - typedef typename _next_iter::type type; - constexpr static int global_flags = - _cil::global_flags | - _helper::global_flags | - _next_iter::global_flags; -}; - -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename generators_done, - typename 
elements -> -struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements> -{ - typedef elements type; - constexpr static int global_flags = 0; -}; - -/** \internal - * - * \class enumerate_group_elements_noid - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Helper template that implements group element enumeration - * - * This is a helper template that implements the actual enumeration - * of group elements. This has been split so that the list of - * generators can be cleansed of the identity element before - * performing the actual operation. - * - * \sa enumerate_group_elements - */ -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename generators, - int initial_global_flags = 0 -> -struct enumerate_group_elements_noid -{ - typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step; - typedef typename first_step::type first_step_elements; - - typedef dimino_add_remaining_generators< - Multiply, - Equality, - id, - typename first_step::generators_done, - typename first_step::next_generators, // remaining_generators - typename first_step::type // first_step elements - > _helper; - - typedef typename _helper::type type; - constexpr static int global_flags = - initial_global_flags | - first_step::global_flags | - _helper::global_flags; -}; - -// in case when no generators are specified -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - int initial_global_flags -> -struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags> -{ - typedef type_list<id> type; - constexpr static int global_flags = initial_global_flags; -}; - -/** \internal - * - * \class enumerate_group_elements - * \ingroup CXX11_TensorSymmetry_Module - * - * \brief Enumerate all elements in a finite group - * - * This template enumerates all elements 
in a finite group. It accepts - * the following template parameters: - * - * \tparam Multiply The multiplication operation that multiplies two group elements - * with each other. - * \tparam Equality The equality check operation that checks if two group elements - * are equal to another. - * \tparam id The identity element - * \tparam _generators A list of (possibly redundant) generators of the group - */ -template< - template<typename, typename> class Multiply, - template<typename, typename> class Equality, - typename id, - typename _generators -> -struct enumerate_group_elements - : public enumerate_group_elements_noid< - Multiply, - Equality, - id, - typename strip_identities<Equality, id, _generators>::type, - strip_identities<Equality, id, _generators>::global_flags - > -{ -}; - -} // end namespace group_theory - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h b/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h deleted file mode 100644 index e4c59dc..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h +++ /dev/null @@ -1,67 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// Barrier is an object that allows one or more threads to wait until -// Notify has been called a specified number of times. 
- -#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H -#define EIGEN_CXX11_THREADPOOL_BARRIER_H - -namespace Eigen { - -class Barrier { - public: - Barrier(unsigned int count) : state_(count << 1), notified_(false) { - eigen_plain_assert(((count << 1) >> 1) == count); - } - ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); } - - void Notify() { - unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; - if (v != 1) { - // Clear the lowest bit (waiter flag) and check that the original state - // value was not zero. If it was zero, it means that notify was called - // more times than the original count. - eigen_plain_assert(((v + 2) & ~1) != 0); - return; // either count has not dropped to 0, or waiter is not waiting - } - std::unique_lock<std::mutex> l(mu_); - eigen_plain_assert(!notified_); - notified_ = true; - cv_.notify_all(); - } - - void Wait() { - unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); - if ((v >> 1) == 0) return; - std::unique_lock<std::mutex> l(mu_); - while (!notified_) { - cv_.wait(l); - } - } - - private: - std::mutex mu_; - std::condition_variable cv_; - std::atomic<unsigned int> state_; // low bit is waiter flag - bool notified_; -}; - -// Notification is an object that allows a user to to wait for another -// thread to signal a notification that an event has occurred. -// -// Multiple threads can wait on the same Notification object, -// but only one caller must call Notify() on the object. -struct Notification : Barrier { - Notification() : Barrier(1){}; -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_THREADPOOL_BARRIER_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h b/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h deleted file mode 100644 index 4549aa0..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h +++ /dev/null @@ -1,249 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ -#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ - -namespace Eigen { - -// EventCount allows to wait for arbitrary predicates in non-blocking -// algorithms. Think of condition variable, but wait predicate does not need to -// be protected by a mutex. Usage: -// Waiting thread does: -// -// if (predicate) -// return act(); -// EventCount::Waiter& w = waiters[my_index]; -// ec.Prewait(&w); -// if (predicate) { -// ec.CancelWait(&w); -// return act(); -// } -// ec.CommitWait(&w); -// -// Notifying thread does: -// -// predicate = true; -// ec.Notify(true); -// -// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not -// cheap, but they are executed only if the preceding predicate check has -// failed. -// -// Algorithm outline: -// There are two main variables: predicate (managed by user) and state_. -// Operation closely resembles Dekker mutual algorithm: -// https://en.wikipedia.org/wiki/Dekker%27s_algorithm -// Waiting thread sets state_ then checks predicate, Notifying thread sets -// predicate then checks state_. Due to seq_cst fences in between these -// operations it is guaranteed than either waiter will see predicate change -// and won't block, or notifying thread will see state_ change and will unblock -// the waiter, or both. But it can't happen that both threads don't see each -// other changes, which would lead to deadlock. -class EventCount { - public: - class Waiter; - - EventCount(MaxSizeVector<Waiter>& waiters) - : state_(kStackMask), waiters_(waiters) { - eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1); - } - - ~EventCount() { - // Ensure there are no waiters. 
- eigen_plain_assert(state_.load() == kStackMask); - } - - // Prewait prepares for waiting. - // After calling Prewait, the thread must re-check the wait predicate - // and then call either CancelWait or CommitWait. - void Prewait() { - uint64_t state = state_.load(std::memory_order_relaxed); - for (;;) { - CheckState(state); - uint64_t newstate = state + kWaiterInc; - CheckState(newstate); - if (state_.compare_exchange_weak(state, newstate, - std::memory_order_seq_cst)) - return; - } - } - - // CommitWait commits waiting after Prewait. - void CommitWait(Waiter* w) { - eigen_plain_assert((w->epoch & ~kEpochMask) == 0); - w->state = Waiter::kNotSignaled; - const uint64_t me = (w - &waiters_[0]) | w->epoch; - uint64_t state = state_.load(std::memory_order_seq_cst); - for (;;) { - CheckState(state, true); - uint64_t newstate; - if ((state & kSignalMask) != 0) { - // Consume the signal and return immidiately. - newstate = state - kWaiterInc - kSignalInc; - } else { - // Remove this thread from pre-wait counter and add to the waiter stack. - newstate = ((state & kWaiterMask) - kWaiterInc) | me; - w->next.store(state & (kStackMask | kEpochMask), - std::memory_order_relaxed); - } - CheckState(newstate); - if (state_.compare_exchange_weak(state, newstate, - std::memory_order_acq_rel)) { - if ((state & kSignalMask) == 0) { - w->epoch += kEpochInc; - Park(w); - } - return; - } - } - } - - // CancelWait cancels effects of the previous Prewait call. - void CancelWait() { - uint64_t state = state_.load(std::memory_order_relaxed); - for (;;) { - CheckState(state, true); - uint64_t newstate = state - kWaiterInc; - // We don't know if the thread was also notified or not, - // so we should not consume a signal unconditionaly. - // Only if number of waiters is equal to number of signals, - // we know that the thread was notified and we must take away the signal. 
- if (((state & kWaiterMask) >> kWaiterShift) == - ((state & kSignalMask) >> kSignalShift)) - newstate -= kSignalInc; - CheckState(newstate); - if (state_.compare_exchange_weak(state, newstate, - std::memory_order_acq_rel)) - return; - } - } - - // Notify wakes one or all waiting threads. - // Must be called after changing the associated wait predicate. - void Notify(bool notifyAll) { - std::atomic_thread_fence(std::memory_order_seq_cst); - uint64_t state = state_.load(std::memory_order_acquire); - for (;;) { - CheckState(state); - const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; - const uint64_t signals = (state & kSignalMask) >> kSignalShift; - // Easy case: no waiters. - if ((state & kStackMask) == kStackMask && waiters == signals) return; - uint64_t newstate; - if (notifyAll) { - // Empty wait stack and set signal to number of pre-wait threads. - newstate = - (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask; - } else if (signals < waiters) { - // There is a thread in pre-wait state, unblock it. - newstate = state + kSignalInc; - } else { - // Pop a waiter from list and unpark it. - Waiter* w = &waiters_[state & kStackMask]; - uint64_t next = w->next.load(std::memory_order_relaxed); - newstate = (state & (kWaiterMask | kSignalMask)) | next; - } - CheckState(newstate); - if (state_.compare_exchange_weak(state, newstate, - std::memory_order_acq_rel)) { - if (!notifyAll && (signals < waiters)) - return; // unblocked pre-wait thread - if ((state & kStackMask) == kStackMask) return; - Waiter* w = &waiters_[state & kStackMask]; - if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed); - Unpark(w); - return; - } - } - } - - class Waiter { - friend class EventCount; - // Align to 128 byte boundary to prevent false sharing with other Waiter - // objects in the same vector. 
- EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next; - std::mutex mu; - std::condition_variable cv; - uint64_t epoch = 0; - unsigned state = kNotSignaled; - enum { - kNotSignaled, - kWaiting, - kSignaled, - }; - }; - - private: - // State_ layout: - // - low kWaiterBits is a stack of waiters committed wait - // (indexes in waiters_ array are used as stack elements, - // kStackMask means empty stack). - // - next kWaiterBits is count of waiters in prewait state. - // - next kWaiterBits is count of pending signals. - // - remaining bits are ABA counter for the stack. - // (stored in Waiter node and incremented on push). - static const uint64_t kWaiterBits = 14; - static const uint64_t kStackMask = (1ull << kWaiterBits) - 1; - static const uint64_t kWaiterShift = kWaiterBits; - static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) - << kWaiterShift; - static const uint64_t kWaiterInc = 1ull << kWaiterShift; - static const uint64_t kSignalShift = 2 * kWaiterBits; - static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1) - << kSignalShift; - static const uint64_t kSignalInc = 1ull << kSignalShift; - static const uint64_t kEpochShift = 3 * kWaiterBits; - static const uint64_t kEpochBits = 64 - kEpochShift; - static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; - static const uint64_t kEpochInc = 1ull << kEpochShift; - std::atomic<uint64_t> state_; - MaxSizeVector<Waiter>& waiters_; - - static void CheckState(uint64_t state, bool waiter = false) { - static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem"); - const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; - const uint64_t signals = (state & kSignalMask) >> kSignalShift; - eigen_plain_assert(waiters >= signals); - eigen_plain_assert(waiters < (1 << kWaiterBits) - 1); - eigen_plain_assert(!waiter || waiters > 0); - (void)waiters; - (void)signals; - } - - void Park(Waiter* w) { - std::unique_lock<std::mutex> lock(w->mu); - while (w->state != 
Waiter::kSignaled) { - w->state = Waiter::kWaiting; - w->cv.wait(lock); - } - } - - void Unpark(Waiter* w) { - for (Waiter* next; w; w = next) { - uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask; - next = wnext == kStackMask ? nullptr : &waiters_[wnext]; - unsigned state; - { - std::unique_lock<std::mutex> lock(w->mu); - state = w->state; - w->state = Waiter::kSignaled; - } - // Avoid notifying if it wasn't waiting. - if (state == Waiter::kWaiting) w->cv.notify_one(); - } - } - - EventCount(const EventCount&) = delete; - void operator=(const EventCount&) = delete; -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h deleted file mode 100644 index 23a2b54..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ /dev/null @@ -1,486 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H -#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H - -namespace Eigen { - -template <typename Environment> -class ThreadPoolTempl : public Eigen::ThreadPoolInterface { - public: - typedef typename Environment::Task Task; - typedef RunQueue<Task, 1024> Queue; - - ThreadPoolTempl(int num_threads, Environment env = Environment()) - : ThreadPoolTempl(num_threads, true, env) {} - - ThreadPoolTempl(int num_threads, bool allow_spinning, - Environment env = Environment()) - : env_(env), - num_threads_(num_threads), - allow_spinning_(allow_spinning), - thread_data_(num_threads), - all_coprimes_(num_threads), - waiters_(num_threads), - global_steal_partition_(EncodePartition(0, num_threads_)), - blocked_(0), - spinning_(0), - done_(false), - cancelled_(false), - ec_(waiters_) { - waiters_.resize(num_threads_); - // Calculate coprimes of all numbers [1, num_threads]. - // Coprimes are used for random walks over all threads in Steal - // and NonEmptyQueueIndex. Iteration is based on the fact that if we take - // a random starting thread index t and calculate num_threads - 1 subsequent - // indices as (t + coprime) % num_threads, we will cover all threads without - // repetitions (effectively getting a presudo-random permutation of thread - // indices). - eigen_plain_assert(num_threads_ < kMaxThreads); - for (int i = 1; i <= num_threads_; ++i) { - all_coprimes_.emplace_back(i); - ComputeCoprimes(i, &all_coprimes_.back()); - } -#ifndef EIGEN_THREAD_LOCAL - init_barrier_.reset(new Barrier(num_threads_)); -#endif - thread_data_.resize(num_threads_); - for (int i = 0; i < num_threads_; i++) { - SetStealPartition(i, EncodePartition(0, num_threads_)); - thread_data_[i].thread.reset( - env_.CreateThread([this, i]() { WorkerLoop(i); })); - } -#ifndef EIGEN_THREAD_LOCAL - // Wait for workers to initialize per_thread_map_. Otherwise we might race - // with them in Schedule or CurrentThreadId. 
- init_barrier_->Wait(); -#endif - } - - ~ThreadPoolTempl() { - done_ = true; - - // Now if all threads block without work, they will start exiting. - // But note that threads can continue to work arbitrary long, - // block, submit new work, unblock and otherwise live full life. - if (!cancelled_) { - ec_.Notify(true); - } else { - // Since we were cancelled, there might be entries in the queues. - // Empty them to prevent their destructor from asserting. - for (size_t i = 0; i < thread_data_.size(); i++) { - thread_data_[i].queue.Flush(); - } - } - // Join threads explicitly (by destroying) to avoid destruction order within - // this class. - for (size_t i = 0; i < thread_data_.size(); ++i) - thread_data_[i].thread.reset(); - } - - void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) { - eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_)); - - // Pass this information to each thread queue. - for (int i = 0; i < num_threads_; i++) { - const auto& pair = partitions[i]; - unsigned start = pair.first, end = pair.second; - AssertBounds(start, end); - unsigned val = EncodePartition(start, end); - SetStealPartition(i, val); - } - } - - void Schedule(std::function<void()> fn) EIGEN_OVERRIDE { - ScheduleWithHint(std::move(fn), 0, num_threads_); - } - - void ScheduleWithHint(std::function<void()> fn, int start, - int limit) override { - Task t = env_.CreateTask(std::move(fn)); - PerThread* pt = GetPerThread(); - if (pt->pool == this) { - // Worker thread of this pool, push onto the thread's queue. - Queue& q = thread_data_[pt->thread_id].queue; - t = q.PushFront(std::move(t)); - } else { - // A free-standing thread (or worker of another pool), push onto a random - // queue. 
- eigen_plain_assert(start < limit); - eigen_plain_assert(limit <= num_threads_); - int num_queues = limit - start; - int rnd = Rand(&pt->rand) % num_queues; - eigen_plain_assert(start + rnd < limit); - Queue& q = thread_data_[start + rnd].queue; - t = q.PushBack(std::move(t)); - } - // Note: below we touch this after making w available to worker threads. - // Strictly speaking, this can lead to a racy-use-after-free. Consider that - // Schedule is called from a thread that is neither main thread nor a worker - // thread of this pool. Then, execution of w directly or indirectly - // completes overall computations, which in turn leads to destruction of - // this. We expect that such scenario is prevented by program, that is, - // this is kept alive while any threads can potentially be in Schedule. - if (!t.f) { - ec_.Notify(false); - } else { - env_.ExecuteTask(t); // Push failed, execute directly. - } - } - - void Cancel() EIGEN_OVERRIDE { - cancelled_ = true; - done_ = true; - - // Let each thread know it's been cancelled. -#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION - for (size_t i = 0; i < thread_data_.size(); i++) { - thread_data_[i].thread->OnCancel(); - } -#endif - - // Wake up the threads without work to let them exit on their own. - ec_.Notify(true); - } - - int NumThreads() const EIGEN_FINAL { return num_threads_; } - - int CurrentThreadId() const EIGEN_FINAL { - const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread(); - if (pt->pool == this) { - return pt->thread_id; - } else { - return -1; - } - } - - private: - // Create a single atomic<int> that encodes start and limit information for - // each thread. - // We expect num_threads_ < 65536, so we can store them in a single - // std::atomic<unsigned>. - // Exposed publicly as static functions so that external callers can reuse - // this encode/decode logic for maintaining their own thread-safe copies of - // scheduling and steal domain(s). 
- static const int kMaxPartitionBits = 16; - static const int kMaxThreads = 1 << kMaxPartitionBits; - - inline unsigned EncodePartition(unsigned start, unsigned limit) { - return (start << kMaxPartitionBits) | limit; - } - - inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) { - *limit = val & (kMaxThreads - 1); - val >>= kMaxPartitionBits; - *start = val; - } - - void AssertBounds(int start, int end) { - eigen_plain_assert(start >= 0); - eigen_plain_assert(start < end); // non-zero sized partition - eigen_plain_assert(end <= num_threads_); - } - - inline void SetStealPartition(size_t i, unsigned val) { - thread_data_[i].steal_partition.store(val, std::memory_order_relaxed); - } - - inline unsigned GetStealPartition(int i) { - return thread_data_[i].steal_partition.load(std::memory_order_relaxed); - } - - void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) { - for (int i = 1; i <= N; i++) { - unsigned a = i; - unsigned b = N; - // If GCD(a, b) == 1, then a and b are coprimes. - while (b != 0) { - unsigned tmp = a; - a = b; - b = tmp % b; - } - if (a == 1) { - coprimes->push_back(i); - } - } - } - - typedef typename Environment::EnvThread Thread; - - struct PerThread { - constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {} - ThreadPoolTempl* pool; // Parent pool, or null for normal threads. - uint64_t rand; // Random generator state. - int thread_id; // Worker thread index in pool. -#ifndef EIGEN_THREAD_LOCAL - // Prevent false sharing. 
- char pad_[128]; -#endif - }; - - struct ThreadData { - constexpr ThreadData() : thread(), steal_partition(0), queue() {} - std::unique_ptr<Thread> thread; - std::atomic<unsigned> steal_partition; - Queue queue; - }; - - Environment env_; - const int num_threads_; - const bool allow_spinning_; - MaxSizeVector<ThreadData> thread_data_; - MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_; - MaxSizeVector<EventCount::Waiter> waiters_; - unsigned global_steal_partition_; - std::atomic<unsigned> blocked_; - std::atomic<bool> spinning_; - std::atomic<bool> done_; - std::atomic<bool> cancelled_; - EventCount ec_; -#ifndef EIGEN_THREAD_LOCAL - std::unique_ptr<Barrier> init_barrier_; - std::mutex per_thread_map_mutex_; // Protects per_thread_map_. - std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_; -#endif - - // Main worker thread loop. - void WorkerLoop(int thread_id) { -#ifndef EIGEN_THREAD_LOCAL - std::unique_ptr<PerThread> new_pt(new PerThread()); - per_thread_map_mutex_.lock(); - bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second; - eigen_plain_assert(insertOK); - EIGEN_UNUSED_VARIABLE(insertOK); - per_thread_map_mutex_.unlock(); - init_barrier_->Notify(); - init_barrier_->Wait(); -#endif - PerThread* pt = GetPerThread(); - pt->pool = this; - pt->rand = GlobalThreadIdHash(); - pt->thread_id = thread_id; - Queue& q = thread_data_[thread_id].queue; - EventCount::Waiter* waiter = &waiters_[thread_id]; - // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is - // proportional to num_threads_ and we assume that new work is scheduled at - // a constant rate, so we set spin_count to 5000 / num_threads_. The - // constant was picked based on a fair dice roll, tune it. - const int spin_count = - allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0; - if (num_threads_ == 1) { - // For num_threads_ == 1 there is no point in going through the expensive - // steal loop. 
Moreover, since NonEmptyQueueIndex() calls PopBack() on the - // victim queues it might reverse the order in which ops are executed - // compared to the order in which they are scheduled, which tends to be - // counter-productive for the types of I/O workloads the single thread - // pools tend to be used for. - while (!cancelled_) { - Task t = q.PopFront(); - for (int i = 0; i < spin_count && !t.f; i++) { - if (!cancelled_.load(std::memory_order_relaxed)) { - t = q.PopFront(); - } - } - if (!t.f) { - if (!WaitForWork(waiter, &t)) { - return; - } - } - if (t.f) { - env_.ExecuteTask(t); - } - } - } else { - while (!cancelled_) { - Task t = q.PopFront(); - if (!t.f) { - t = LocalSteal(); - if (!t.f) { - t = GlobalSteal(); - if (!t.f) { - // Leave one thread spinning. This reduces latency. - if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) { - for (int i = 0; i < spin_count && !t.f; i++) { - if (!cancelled_.load(std::memory_order_relaxed)) { - t = GlobalSteal(); - } else { - return; - } - } - spinning_ = false; - } - if (!t.f) { - if (!WaitForWork(waiter, &t)) { - return; - } - } - } - } - } - if (t.f) { - env_.ExecuteTask(t); - } - } - } - } - - // Steal tries to steal work from other worker threads in the range [start, - // limit) in best-effort manner. 
- Task Steal(unsigned start, unsigned limit) { - PerThread* pt = GetPerThread(); - const size_t size = limit - start; - unsigned r = Rand(&pt->rand); - // Reduce r into [0, size) range, this utilizes trick from - // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30)); - unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32; - unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32; - unsigned inc = all_coprimes_[size - 1][index]; - - for (unsigned i = 0; i < size; i++) { - eigen_plain_assert(start + victim < limit); - Task t = thread_data_[start + victim].queue.PopBack(); - if (t.f) { - return t; - } - victim += inc; - if (victim >= size) { - victim -= size; - } - } - return Task(); - } - - // Steals work within threads belonging to the partition. - Task LocalSteal() { - PerThread* pt = GetPerThread(); - unsigned partition = GetStealPartition(pt->thread_id); - // If thread steal partition is the same as global partition, there is no - // need to go through the steal loop twice. - if (global_steal_partition_ == partition) return Task(); - unsigned start, limit; - DecodePartition(partition, &start, &limit); - AssertBounds(start, limit); - - return Steal(start, limit); - } - - // Steals work from any other thread in the pool. - Task GlobalSteal() { - return Steal(0, num_threads_); - } - - - // WaitForWork blocks until new work is available (returns true), or if it is - // time to exit (returns false). Can optionally return a task to execute in t - // (in such case t.f != nullptr on return). - bool WaitForWork(EventCount::Waiter* waiter, Task* t) { - eigen_plain_assert(!t->f); - // We already did best-effort emptiness check in Steal, so prepare for - // blocking. - ec_.Prewait(); - // Now do a reliable emptiness check. 
- int victim = NonEmptyQueueIndex(); - if (victim != -1) { - ec_.CancelWait(); - if (cancelled_) { - return false; - } else { - *t = thread_data_[victim].queue.PopBack(); - return true; - } - } - // Number of blocked threads is used as termination condition. - // If we are shutting down and all worker threads blocked without work, - // that's we are done. - blocked_++; - // TODO is blocked_ required to be unsigned? - if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) { - ec_.CancelWait(); - // Almost done, but need to re-check queues. - // Consider that all queues are empty and all worker threads are preempted - // right after incrementing blocked_ above. Now a free-standing thread - // submits work and calls destructor (which sets done_). If we don't - // re-check queues, we will exit leaving the work unexecuted. - if (NonEmptyQueueIndex() != -1) { - // Note: we must not pop from queues before we decrement blocked_, - // otherwise the following scenario is possible. Consider that instead - // of checking for emptiness we popped the only element from queues. - // Now other worker threads can start exiting, which is bad if the - // work item submits other work. So we just check emptiness here, - // which ensures that all worker threads exit at the same time. - blocked_--; - return true; - } - // Reached stable termination state. - ec_.Notify(true); - return false; - } - ec_.CommitWait(waiter); - blocked_--; - return true; - } - - int NonEmptyQueueIndex() { - PerThread* pt = GetPerThread(); - // We intentionally design NonEmptyQueueIndex to steal work from - // anywhere in the queue so threads don't block in WaitForWork() forever - // when all threads in their partition go to sleep. Steal is still local. 
- const size_t size = thread_data_.size(); - unsigned r = Rand(&pt->rand); - unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()]; - unsigned victim = r % size; - for (unsigned i = 0; i < size; i++) { - if (!thread_data_[victim].queue.Empty()) { - return victim; - } - victim += inc; - if (victim >= size) { - victim -= size; - } - } - return -1; - } - - static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() { - return std::hash<std::thread::id>()(std::this_thread::get_id()); - } - - EIGEN_STRONG_INLINE PerThread* GetPerThread() { -#ifndef EIGEN_THREAD_LOCAL - static PerThread dummy; - auto it = per_thread_map_.find(GlobalThreadIdHash()); - if (it == per_thread_map_.end()) { - return &dummy; - } else { - return it->second.get(); - } -#else - EIGEN_THREAD_LOCAL PerThread per_thread_; - PerThread* pt = &per_thread_; - return pt; -#endif - } - - static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) { - uint64_t current = *state; - // Update the internal state - *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; - // Generate the random output (using the PCG-XSH-RS scheme) - return static_cast<unsigned>((current ^ (current >> 22)) >> - (22 + (current >> 61))); - } -}; - -typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool; - -} // namespace Eigen - -#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h b/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h deleted file mode 100644 index b572ebc..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h +++ /dev/null @@ -1,236 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ -#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ - -namespace Eigen { - -// RunQueue is a fixed-size, partially non-blocking deque or Work items. -// Operations on front of the queue must be done by a single thread (owner), -// operations on back of the queue can be done by multiple threads concurrently. -// -// Algorithm outline: -// All remote threads operating on the queue back are serialized by a mutex. -// This ensures that at most two threads access state: owner and one remote -// thread (Size aside). The algorithm ensures that the occupied region of the -// underlying array is logically continuous (can wraparound, but no stray -// occupied elements). Owner operates on one end of this region, remote thread -// operates on the other end. Synchronization between these threads -// (potential consumption of the last element and take up of the last empty -// element) happens by means of state variable in each element. States are: -// empty, busy (in process of insertion of removal) and ready. Threads claim -// elements (empty->busy and ready->busy transitions) by means of a CAS -// operation. The finishing transition (busy->empty and busy->ready) are done -// with plain store as the element is exclusively owned by the current thread. -// -// Note: we could permit only pointers as elements, then we would not need -// separate state variable as null/non-null pointer value would serve as state, -// but that would require malloc/free per operation for large, complex values -// (and this is designed to store std::function<()>). -template <typename Work, unsigned kSize> -class RunQueue { - public: - RunQueue() : front_(0), back_(0) { - // require power-of-two for fast masking - eigen_plain_assert((kSize & (kSize - 1)) == 0); - eigen_plain_assert(kSize > 2); // why would you do this? 
- eigen_plain_assert(kSize <= (64 << 10)); // leave enough space for counter - for (unsigned i = 0; i < kSize; i++) - array_[i].state.store(kEmpty, std::memory_order_relaxed); - } - - ~RunQueue() { eigen_plain_assert(Size() == 0); } - - // PushFront inserts w at the beginning of the queue. - // If queue is full returns w, otherwise returns default-constructed Work. - Work PushFront(Work w) { - unsigned front = front_.load(std::memory_order_relaxed); - Elem* e = &array_[front & kMask]; - uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kEmpty || - !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) - return w; - front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed); - e->w = std::move(w); - e->state.store(kReady, std::memory_order_release); - return Work(); - } - - // PopFront removes and returns the first element in the queue. - // If the queue was empty returns default-constructed Work. - Work PopFront() { - unsigned front = front_.load(std::memory_order_relaxed); - Elem* e = &array_[(front - 1) & kMask]; - uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kReady || - !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) - return Work(); - Work w = std::move(e->w); - e->state.store(kEmpty, std::memory_order_release); - front = ((front - 1) & kMask2) | (front & ~kMask2); - front_.store(front, std::memory_order_relaxed); - return w; - } - - // PushBack adds w at the end of the queue. - // If queue is full returns w, otherwise returns default-constructed Work. 
- Work PushBack(Work w) { - std::unique_lock<std::mutex> lock(mutex_); - unsigned back = back_.load(std::memory_order_relaxed); - Elem* e = &array_[(back - 1) & kMask]; - uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kEmpty || - !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) - return w; - back = ((back - 1) & kMask2) | (back & ~kMask2); - back_.store(back, std::memory_order_relaxed); - e->w = std::move(w); - e->state.store(kReady, std::memory_order_release); - return Work(); - } - - // PopBack removes and returns the last elements in the queue. - Work PopBack() { - if (Empty()) return Work(); - std::unique_lock<std::mutex> lock(mutex_); - unsigned back = back_.load(std::memory_order_relaxed); - Elem* e = &array_[back & kMask]; - uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kReady || - !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) - return Work(); - Work w = std::move(e->w); - e->state.store(kEmpty, std::memory_order_release); - back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed); - return w; - } - - // PopBackHalf removes and returns half last elements in the queue. - // Returns number of elements removed. - unsigned PopBackHalf(std::vector<Work>* result) { - if (Empty()) return 0; - std::unique_lock<std::mutex> lock(mutex_); - unsigned back = back_.load(std::memory_order_relaxed); - unsigned size = Size(); - unsigned mid = back; - if (size > 1) mid = back + (size - 1) / 2; - unsigned n = 0; - unsigned start = 0; - for (; static_cast<int>(mid - back) >= 0; mid--) { - Elem* e = &array_[mid & kMask]; - uint8_t s = e->state.load(std::memory_order_relaxed); - if (n == 0) { - if (s != kReady || !e->state.compare_exchange_strong( - s, kBusy, std::memory_order_acquire)) - continue; - start = mid; - } else { - // Note: no need to store temporal kBusy, we exclusively own these - // elements. 
- eigen_plain_assert(s == kReady); - } - result->push_back(std::move(e->w)); - e->state.store(kEmpty, std::memory_order_release); - n++; - } - if (n != 0) - back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed); - return n; - } - - // Size returns current queue size. - // Can be called by any thread at any time. - unsigned Size() const { return SizeOrNotEmpty<true>(); } - - // Empty tests whether container is empty. - // Can be called by any thread at any time. - bool Empty() const { return SizeOrNotEmpty<false>() == 0; } - - // Delete all the elements from the queue. - void Flush() { - while (!Empty()) { - PopFront(); - } - } - - private: - static const unsigned kMask = kSize - 1; - static const unsigned kMask2 = (kSize << 1) - 1; - struct Elem { - std::atomic<uint8_t> state; - Work w; - }; - enum { - kEmpty, - kBusy, - kReady, - }; - std::mutex mutex_; - // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of - // front/back, respectively. The remaining bits contain modification counters - // that are incremented on Push operations. This allows us to (1) distinguish - // between empty and full conditions (if we would use log(kSize) bits for - // position, these conditions would be indistinguishable); (2) obtain - // consistent snapshot of front_/back_ for Size operation using the - // modification counters. - std::atomic<unsigned> front_; - std::atomic<unsigned> back_; - Elem array_[kSize]; - - // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, - // only whether the size is 0 is guaranteed to be correct. - // Can be called by any thread at any time. - template<bool NeedSizeEstimate> - unsigned SizeOrNotEmpty() const { - // Emptiness plays critical role in thread pool blocking. So we go to great - // effort to not produce false positives (claim non-empty queue as empty). - unsigned front = front_.load(std::memory_order_acquire); - for (;;) { - // Capture a consistent snapshot of front/tail. 
- unsigned back = back_.load(std::memory_order_acquire); - unsigned front1 = front_.load(std::memory_order_relaxed); - if (front != front1) { - front = front1; - std::atomic_thread_fence(std::memory_order_acquire); - continue; - } - if (NeedSizeEstimate) { - return CalculateSize(front, back); - } else { - // This value will be 0 if the queue is empty, and undefined otherwise. - unsigned maybe_zero = ((front ^ back) & kMask2); - // Queue size estimate must agree with maybe zero check on the queue - // empty/non-empty state. - eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0)); - return maybe_zero; - } - } - } - - EIGEN_ALWAYS_INLINE - unsigned CalculateSize(unsigned front, unsigned back) const { - int size = (front & kMask2) - (back & kMask2); - // Fix overflow. - if (size < 0) size += 2 * kSize; - // Order of modification in push/pop is crafted to make the queue look - // larger than it is during concurrent modifications. E.g. push can - // increment size before the corresponding pop has decremented it. - // So the computed size can be up to kSize + 1, fix it. - if (size > static_cast<int>(kSize)) size = kSize; - return static_cast<unsigned>(size); - } - - RunQueue(const RunQueue&) = delete; - void operator=(const RunQueue&) = delete; -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h deleted file mode 100644 index a05685f..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h +++ /dev/null @@ -1,23 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H -#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H - -// Try to come up with a portable way to cancel a thread -#if EIGEN_OS_GNULINUX - #define EIGEN_THREAD_CANCEL(t) \ - pthread_cancel(t.native_handle()); - #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1 -#else -#define EIGEN_THREAD_CANCEL(t) -#endif - - -#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h deleted file mode 100644 index d94a064..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h +++ /dev/null @@ -1,40 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H -#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H - -namespace Eigen { - -struct StlThreadEnvironment { - struct Task { - std::function<void()> f; - }; - - // EnvThread constructor must start the thread, - // destructor must join the thread. - class EnvThread { - public: - EnvThread(std::function<void()> f) : thr_(std::move(f)) {} - ~EnvThread() { thr_.join(); } - // This function is called when the threadpool is cancelled. 
- void OnCancel() { } - - private: - std::thread thr_; - }; - - EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); } - Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } - void ExecuteTask(const Task& t) { t.f(); } -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h deleted file mode 100644 index 4e68474..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h +++ /dev/null @@ -1,301 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H -#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H - -#ifdef EIGEN_AVOID_THREAD_LOCAL - -#ifdef EIGEN_THREAD_LOCAL -#undef EIGEN_THREAD_LOCAL -#endif - -#else - -#if EIGEN_MAX_CPP_VER >= 11 && \ - ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \ - __has_feature(cxx_thread_local) || \ - (EIGEN_COMP_MSVC >= 1900) ) -#define EIGEN_THREAD_LOCAL static thread_local -#endif - -// Disable TLS for Apple and Android builds with older toolchains. -#if defined(__APPLE__) -// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED, -// __IPHONE_8_0. -#include <Availability.h> -#include <TargetConditionals.h> -#endif -// Checks whether C++11's `thread_local` storage duration specifier is -// supported. 
-#if defined(__apple_build_version__) && \ - ((__apple_build_version__ < 8000042) || \ - (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0)) -// Notes: Xcode's clang did not support `thread_local` until version -// 8, and even then not for all iOS < 9.0. -#undef EIGEN_THREAD_LOCAL - -#elif defined(__ANDROID__) && EIGEN_COMP_CLANG -// There are platforms for which TLS should not be used even though the compiler -// makes it seem like it's supported (Android NDK < r12b for example). -// This is primarily because of linker problems and toolchain misconfiguration: -// TLS isn't supported until NDK r12b per -// https://developer.android.com/ndk/downloads/revision_history.html -// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in -// <android/ndk-version.h>. For NDK < r16, users should define these macros, -// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11. -#if __has_include(<android/ndk-version.h>) -#include <android/ndk-version.h> -#endif // __has_include(<android/ndk-version.h>) -#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ - defined(__NDK_MINOR__) && \ - ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) -#undef EIGEN_THREAD_LOCAL -#endif -#endif // defined(__ANDROID__) && defined(__clang__) - -#endif // EIGEN_AVOID_THREAD_LOCAL - -namespace Eigen { - -namespace internal { -template <typename T> -struct ThreadLocalNoOpInitialize { - void operator()(T&) const {} -}; - -template <typename T> -struct ThreadLocalNoOpRelease { - void operator()(T&) const {} -}; - -} // namespace internal - -// Thread local container for elements of type T, that does not use thread local -// storage. As long as the number of unique threads accessing this storage -// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will -// use a mutex for synchronization. 
-// -// Type `T` has to be default constructible, and by default each thread will get -// a default constructed value. It is possible to specify custom `initialize` -// callable, that will be called lazily from each thread accessing this object, -// and will be passed a default initialized object of type `T`. Also it's -// possible to pass a custom `release` callable, that will be invoked before -// calling ~T(). -// -// Example: -// -// struct Counter { -// int value = 0; -// } -// -// Eigen::ThreadLocal<Counter> counter(10); -// -// // Each thread will have access to it's own counter object. -// Counter& cnt = counter.local(); -// cnt++; -// -// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by -// std::this_thread::get_id() to identify threads. This value is not guaranteed -// to be unique except for the life of the thread. A newly created thread may -// get an OS-specific ID equal to that of an already destroyed thread. -// -// Somewhat similar to TBB thread local storage, with similar restrictions: -// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html -// -template <typename T, - typename Initialize = internal::ThreadLocalNoOpInitialize<T>, - typename Release = internal::ThreadLocalNoOpRelease<T>> -class ThreadLocal { - // We preallocate default constructed elements in MaxSizedVector. 
- static_assert(std::is_default_constructible<T>::value, - "ThreadLocal data type must be default constructible"); - - public: - explicit ThreadLocal(int capacity) - : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(), - internal::ThreadLocalNoOpRelease<T>()) {} - - ThreadLocal(int capacity, Initialize initialize) - : ThreadLocal(capacity, std::move(initialize), - internal::ThreadLocalNoOpRelease<T>()) {} - - ThreadLocal(int capacity, Initialize initialize, Release release) - : initialize_(std::move(initialize)), - release_(std::move(release)), - capacity_(capacity), - data_(capacity_), - ptr_(capacity_), - filled_records_(0) { - eigen_assert(capacity_ >= 0); - data_.resize(capacity_); - for (int i = 0; i < capacity_; ++i) { - ptr_.emplace_back(nullptr); - } - } - - T& local() { - std::thread::id this_thread = std::this_thread::get_id(); - if (capacity_ == 0) return SpilledLocal(this_thread); - - std::size_t h = std::hash<std::thread::id>()(this_thread); - const int start_idx = h % capacity_; - - // NOTE: From the definition of `std::this_thread::get_id()` it is - // guaranteed that we never can have concurrent insertions with the same key - // to our hash-map like data structure. If we didn't find an element during - // the initial traversal, it's guaranteed that no one else could have - // inserted it while we are in this function. This allows to massively - // simplify out lock-free insert-only hash map. - - // Check if we already have an element for `this_thread`. - int idx = start_idx; - while (ptr_[idx].load() != nullptr) { - ThreadIdAndValue& record = *(ptr_[idx].load()); - if (record.thread_id == this_thread) return record.value; - - idx += 1; - if (idx >= capacity_) idx -= capacity_; - if (idx == start_idx) break; - } - - // If we are here, it means that we found an insertion point in lookup - // table at `idx`, or we did a full traversal and table is full. - - // If lock-free storage is full, fallback on mutex. 
- if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread); - - // We double check that we still have space to insert an element into a lock - // free storage. If old value in `filled_records_` is larger than the - // records capacity, it means that some other thread added an element while - // we were traversing lookup table. - int insertion_index = - filled_records_.fetch_add(1, std::memory_order_relaxed); - if (insertion_index >= capacity_) return SpilledLocal(this_thread); - - // At this point it's guaranteed that we can access to - // data_[insertion_index_] without a data race. - data_[insertion_index].thread_id = this_thread; - initialize_(data_[insertion_index].value); - - // That's the pointer we'll put into the lookup table. - ThreadIdAndValue* inserted = &data_[insertion_index]; - - // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop. - ThreadIdAndValue* empty = nullptr; - - // Now we have to find an insertion point into the lookup table. We start - // from the `idx` that was identified as an insertion point above, it's - // guaranteed that we will have an empty record somewhere in a lookup table - // (because we created a record in the `data_`). - const int insertion_idx = idx; - - do { - // Always start search from the original insertion candidate. - idx = insertion_idx; - while (ptr_[idx].load() != nullptr) { - idx += 1; - if (idx >= capacity_) idx -= capacity_; - // If we did a full loop, it means that we don't have any free entries - // in the lookup table, and this means that something is terribly wrong. - eigen_assert(idx != insertion_idx); - } - // Atomic CAS of the pointer guarantees that any other thread, that will - // follow this pointer will see all the mutations in the `data_`. - } while (!ptr_[idx].compare_exchange_weak(empty, inserted)); - - return inserted->value; - } - - // WARN: It's not thread safe to call it concurrently with `local()`. 
- void ForEach(std::function<void(std::thread::id, T&)> f) { - // Reading directly from `data_` is unsafe, because only CAS to the - // record in `ptr_` makes all changes visible to other threads. - for (auto& ptr : ptr_) { - ThreadIdAndValue* record = ptr.load(); - if (record == nullptr) continue; - f(record->thread_id, record->value); - } - - // We did not spill into the map based storage. - if (filled_records_.load(std::memory_order_relaxed) < capacity_) return; - - // Adds a happens before edge from the last call to SpilledLocal(). - std::unique_lock<std::mutex> lock(mu_); - for (auto& kv : per_thread_map_) { - f(kv.first, kv.second); - } - } - - // WARN: It's not thread safe to call it concurrently with `local()`. - ~ThreadLocal() { - // Reading directly from `data_` is unsafe, because only CAS to the record - // in `ptr_` makes all changes visible to other threads. - for (auto& ptr : ptr_) { - ThreadIdAndValue* record = ptr.load(); - if (record == nullptr) continue; - release_(record->value); - } - - // We did not spill into the map based storage. - if (filled_records_.load(std::memory_order_relaxed) < capacity_) return; - - // Adds a happens before edge from the last call to SpilledLocal(). - std::unique_lock<std::mutex> lock(mu_); - for (auto& kv : per_thread_map_) { - release_(kv.second); - } - } - - private: - struct ThreadIdAndValue { - std::thread::id thread_id; - T value; - }; - - // Use unordered map guarded by a mutex when lock free storage is full. - T& SpilledLocal(std::thread::id this_thread) { - std::unique_lock<std::mutex> lock(mu_); - - auto it = per_thread_map_.find(this_thread); - if (it == per_thread_map_.end()) { - auto result = per_thread_map_.emplace(this_thread, T()); - eigen_assert(result.second); - initialize_((*result.first).second); - return (*result.first).second; - } else { - return it->second; - } - } - - Initialize initialize_; - Release release_; - const int capacity_; - - // Storage that backs lock-free lookup table `ptr_`. 
Records stored in this - // storage contiguously starting from index 0. - MaxSizeVector<ThreadIdAndValue> data_; - - // Atomic pointers to the data stored in `data_`. Used as a lookup table for - // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing). - MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_; - - // Number of records stored in the `data_`. - std::atomic<int> filled_records_; - - // We fallback on per thread map if lock-free storage is full. In practice - // this should never happen, if `capacity_` is a reasonable estimate of the - // number of threads running in a system. - std::mutex mu_; // Protects per_thread_map_. - std::unordered_map<std::thread::id, T> per_thread_map_; -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h deleted file mode 100644 index 25030dc..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h +++ /dev/null @@ -1,48 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H -#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H - -namespace Eigen { - -// This defines an interface that ThreadPoolDevice can take to use -// custom thread pools underneath. -class ThreadPoolInterface { - public: - // Submits a closure to be run by a thread in the pool. - virtual void Schedule(std::function<void()> fn) = 0; - - // Submits a closure to be run by threads in the range [start, end) in the - // pool. 
- virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/, - int /*end*/) { - // Just defer to Schedule in case sub-classes aren't interested in - // overriding this functionality. - Schedule(fn); - } - - // If implemented, stop processing the closures that have been enqueued. - // Currently running closures may still be processed. - // If not implemented, does nothing. - virtual void Cancel() {} - - // Returns the number of threads in the pool. - virtual int NumThreads() const = 0; - - // Returns a logical thread index between 0 and NumThreads() - 1 if called - // from one of the threads in the pool. Returns -1 otherwise. - virtual int CurrentThreadId() const = 0; - - virtual ~ThreadPoolInterface() {} -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h deleted file mode 100644 index a859c7b..0000000 --- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h +++ /dev/null @@ -1,20 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H -#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H - -// Try to come up with a portable way to yield -#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7) -#define EIGEN_THREAD_YIELD() sched_yield() -#else -#define EIGEN_THREAD_YIELD() std::this_thread::yield() -#endif - -#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H diff --git a/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h b/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h deleted file mode 100644 index 149ceaf..0000000 --- a/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h +++ /dev/null @@ -1,537 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11META_H -#define EIGEN_CXX11META_H - -#include <vector> -#include "EmulateArray.h" - -#include "CXX11Workarounds.h" - -namespace Eigen { - -namespace internal { - -/** \internal - * \file CXX11/util/CXX11Meta.h - * This file contains generic metaprogramming classes which are not specifically related to Eigen. - * This file expands upon Core/util/Meta.h and adds support for C++11 specific features. - */ - -template<typename... tt> -struct type_list { constexpr static int count = sizeof...(tt); }; - -template<typename t, typename... tt> -struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; }; - -template<typename T, T... nn> -struct numeric_list { constexpr static std::size_t count = sizeof...(nn); }; - -template<typename T, T n, T... 
nn> -struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; }; - -#ifndef EIGEN_PARSED_BY_DOXYGEN -/* numeric list constructors - * - * equivalencies: - * constructor result - * typename gen_numeric_list<int, 5>::type numeric_list<int, 0,1,2,3,4> - * typename gen_numeric_list_reversed<int, 5>::type numeric_list<int, 4,3,2,1,0> - * typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4> - * typename gen_numeric_list_repeated<int, 0, 5>::type numeric_list<int, 0,0,0,0,0> - */ - -template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list : gen_numeric_list<T, n-1, start, start + n-1, ii...> {}; -template<typename T, T start, T... ii> struct gen_numeric_list<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; }; - -template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list_reversed : gen_numeric_list_reversed<T, n-1, start, ii..., start + n-1> {}; -template<typename T, T start, T... ii> struct gen_numeric_list_reversed<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; }; - -template<typename T, std::size_t n, T a, T b, T start = 0, T... ii> struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair<T, n-1, a, b, start, (start + n-1) == a ? b : ((start + n-1) == b ? a : (start + n-1)), ii...> {}; -template<typename T, T a, T b, T start, T... ii> struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> { typedef numeric_list<T, ii...> type; }; - -template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated : gen_numeric_list_repeated<T, n-1, V, V, nn...> {}; -template<typename T, T V, T... nn> struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; }; - -/* list manipulation: concatenate */ - -template<class a, class b> struct concat; - -template<typename... as, typename... 
bs> struct concat<type_list<as...>, type_list<bs...>> { typedef type_list<as..., bs...> type; }; -template<typename T, T... as, T... bs> struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; }; - -template<typename... p> struct mconcat; -template<typename a> struct mconcat<a> { typedef a type; }; -template<typename a, typename b> struct mconcat<a, b> : concat<a, b> {}; -template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {}; - -/* list manipulation: extract slices */ - -template<int n, typename x> struct take; -template<int n, typename a, typename... as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {}; -template<int n> struct take<n, type_list<>> { typedef type_list<> type; }; -template<typename a, typename... as> struct take<0, type_list<a, as...>> { typedef type_list<> type; }; -template<> struct take<0, type_list<>> { typedef type_list<> type; }; - -template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {}; -template<typename T, int n> struct take<n, numeric_list<T>> { typedef numeric_list<T> type; }; -template<typename T, T a, T... as> struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; }; -template<typename T> struct take<0, numeric_list<T>> { typedef numeric_list<T> type; }; - -template<typename T, int n, T... ii> struct h_skip_helper_numeric; -template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {}; -template<typename T, T i, T... 
ii> struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; }; -template<typename T, int n> struct h_skip_helper_numeric<T, n> { typedef numeric_list<T> type; }; -template<typename T> struct h_skip_helper_numeric<T, 0> { typedef numeric_list<T> type; }; - -template<int n, typename... tt> struct h_skip_helper_type; -template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {}; -template<typename t, typename... tt> struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; }; -template<int n> struct h_skip_helper_type<n> { typedef type_list<> type; }; -template<> struct h_skip_helper_type<0> { typedef type_list<> type; }; -#endif //not EIGEN_PARSED_BY_DOXYGEN - -template<int n> -struct h_skip { - template<typename T, T... ii> - constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); } - template<typename... tt> - constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); } -}; - -template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; }; - -template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {}; - -/* list manipulation: retrieve single element from list */ - -template<int n, typename x> struct get; - -template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {}; -template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; }; - -template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {}; -template<typename T, T a, T... 
as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; }; - -template<std::size_t n, typename T, T a, T... as> constexpr T array_get(const numeric_list<T, a, as...>&) { - return get<(int)n, numeric_list<T, a, as...>>::value; -} - -/* always get type, regardless of dummy; good for parameter pack expansion */ - -template<typename T, T dummy, typename t> struct id_numeric { typedef t type; }; -template<typename dummy, typename t> struct id_type { typedef t type; }; - -/* equality checking, flagged version */ - -template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; }; - -/* apply_op to list */ - -template< - bool from_left, // false - template<typename, typename> class op, - typename additional_param, - typename... values -> -struct h_apply_op_helper { typedef type_list<typename op<values, additional_param>::type...> type; }; -template< - template<typename, typename> class op, - typename additional_param, - typename... values -> -struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; }; - -template< - bool from_left, - template<typename, typename> class op, - typename additional_param -> -struct h_apply_op -{ - template<typename... 
values> - constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>) - { return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); } -}; - -template< - template<typename, typename> class op, - typename additional_param, - typename a -> -struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; }; - -template< - template<typename, typename> class op, - typename additional_param, - typename a -> -struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; }; - -/* see if an element is in a list */ - -template< - template<typename, typename> class test, - typename check_against, - typename h_list, - bool last_check_positive = false -> -struct contained_in_list; - -template< - template<typename, typename> class test, - typename check_against, - typename h_list -> -struct contained_in_list<test, check_against, h_list, true> -{ - constexpr static bool value = true; -}; - -template< - template<typename, typename> class test, - typename check_against, - typename a, - typename... 
as -> -struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {}; - -template< - template<typename, typename> class test, - typename check_against - EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty) -> -struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; }; - -/* see if an element is in a list and check for global flags */ - -template< - template<typename, typename> class test, - typename check_against, - typename h_list, - int default_flags = 0, - bool last_check_positive = false, - int last_check_flags = default_flags -> -struct contained_in_list_gf; - -template< - template<typename, typename> class test, - typename check_against, - typename h_list, - int default_flags, - int last_check_flags -> -struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags> -{ - constexpr static bool value = true; - constexpr static int global_flags = last_check_flags; -}; - -template< - template<typename, typename> class test, - typename check_against, - typename a, - typename... as, - int default_flags, - int last_check_flags -> -struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {}; - -template< - template<typename, typename> class test, - typename check_against - EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), - int default_flags, - int last_check_flags -> -struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; }; - -/* generic reductions */ - -template< - typename Reducer, - typename... 
Ts -> struct reduce; - -template< - typename Reducer -> struct reduce<Reducer> -{ - EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; } -}; - -template< - typename Reducer, - typename A -> struct reduce<Reducer, A> -{ - EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; } -}; - -template< - typename Reducer, - typename A, - typename... Ts -> struct reduce<Reducer, A, Ts...> -{ - EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) { - return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...)); - } -}; - -/* generic binary operations */ - -struct sum_op { - template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) { return a + b; } - static constexpr int Identity = 0; -}; -struct product_op { - template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) { return a * b; } - static constexpr int Identity = 1; -}; - -struct logical_and_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) { return a && b; } }; -struct logical_or_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) { return a || b; } }; - -struct equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) { return a == b; } }; -struct not_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) { return a != b; } }; -struct lesser_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) { return a < b; } }; -struct lesser_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) 
-> decltype(a <= b) { return a <= b; } }; -struct greater_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) { return a > b; } }; -struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; - -/* generic unary operations */ - -struct not_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) { return !a; } }; -struct negation_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) { return -a; } }; -struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) { return a >= 0; } }; - - -/* reductions for lists */ - -// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it -// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 -// does... -template<typename... Ts> -EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts) -{ - return reduce<product_op, Ts...>::run(ts...); -} - -template<typename... Ts> -constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts) -{ - return reduce<sum_op, Ts...>::run(ts...); -} - -/* reverse arrays */ - -template<typename Array, int... 
n> -constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>) -{ - return {{array_get<sizeof...(n) - n - 1>(arr)...}}; -} - -template<typename T, std::size_t N> -constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr) -{ - return h_array_reverse(arr, typename gen_numeric_list<int, N>::type()); -} - - -/* generic array reductions */ - -// can't reuse standard reduce() interface above because Intel's Compiler -// *really* doesn't like it, so we just reimplement the stuff -// (start from N - 1 and work down to 0 because specialization for -// n == N - 1 also doesn't work in Intel's compiler, so it goes into -// an infinite loop) -template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1> -struct h_array_reduce { - EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr))) - { - return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)); - } -}; - -template<typename Reducer, typename T, std::size_t N> -struct h_array_reduce<Reducer, T, N, 0> -{ - EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T) - { - return array_get<0>(arr); - } -}; - -template<typename Reducer, typename T> -struct h_array_reduce<Reducer, T, 0> -{ - EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity) - { - return identity; - } -}; - -template<typename Reducer, typename T, std::size_t N> -EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity)) -{ - return h_array_reduce<Reducer, T, N>::run(arr, identity); -} - -/* standard array reductions */ - -template<typename T, std::size_t N> -EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> 
decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0))) -{ - return array_reduce<sum_op, T, N>(arr, static_cast<T>(0)); -} - -template<typename T, std::size_t N> -EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1))) -{ - return array_reduce<product_op, T, N>(arr, static_cast<T>(1)); -} - -template<typename t> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) { - eigen_assert(a.size() > 0); - t prod = 1; - for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } - return prod; -} - -/* zip an array */ - -template<typename Op, typename A, typename B, std::size_t N, int... n> -constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -{ - return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }}; -} - -template<typename Op, typename A, typename B, std::size_t N> -constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b) -{ - return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type()); -} - -/* zip an array and reduce the result */ - -template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... 
n> -constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...)) -{ - return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...); -} - -template<typename Reducer, typename Op, typename A, typename B, std::size_t N> -constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type())) -{ - return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()); -} - -/* apply stuff to an array */ - -template<typename Op, typename A, std::size_t N, int... n> -constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>) -{ - return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }}; -} - -template<typename Op, typename A, std::size_t N> -constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a) -{ - return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type()); -} - -/* apply stuff to an array and reduce */ - -template<typename Reducer, typename Op, typename A, std::size_t N, int... 
n> -constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...)) -{ - return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...); -} - -template<typename Reducer, typename Op, typename A, std::size_t N> -constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type())) -{ - return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()); -} - -/* repeat a value n times (and make an array out of it - * usage: - * array<int, 16> = repeat<16>(42); - */ - -template<int n> -struct h_repeat -{ - template<typename t, int... ii> - constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>) - { - return {{ typename id_numeric<int, ii, t>::type(v)... }}; - } -}; - -template<int n, typename t> -constexpr array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); } - -/* instantiate a class by a C-style array */ -template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps> -struct h_instantiate_by_c_array; - -template<class InstType, typename ArrType, std::size_t N, typename... Ps> -struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...> -{ - static InstType run(ArrType* arr, Ps... args) - { - return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]); - } -}; - -template<class InstType, typename ArrType, std::size_t N, typename... Ps> -struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...> -{ - static InstType run(ArrType* arr, Ps... 
args) - { - return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...); - } -}; - -template<class InstType, typename ArrType, typename... Ps> -struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...> -{ - static InstType run(ArrType* arr, Ps... args) - { - (void)arr; - return InstType(args...); - } -}; - -template<class InstType, typename ArrType, typename... Ps> -struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...> -{ - static InstType run(ArrType* arr, Ps... args) - { - (void)arr; - return InstType(args...); - } -}; - -template<class InstType, typename ArrType, std::size_t N, bool Reverse = false> -InstType instantiate_by_c_array(ArrType* arr) -{ - return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11META_H diff --git a/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h b/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h deleted file mode 100644 index 056736c..0000000 --- a/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h +++ /dev/null @@ -1,88 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11WORKAROUNDS_H -#define EIGEN_CXX11WORKAROUNDS_H - -/* COMPATIBILITY CHECKS - * (so users of compilers that are too old get some realistic error messages) - */ -#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310) -#error Intel Compiler only supports required C++ features since version 13.1. 
-// note that most stuff in principle works with 13.0 but when combining -// some features, at some point 13.0 will just fail with an internal assertion -#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) -// G++ < 4.6 by default will continue processing the source files - even if we use #error to make -// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error -// it sees. Unfortunately, that is still not our #error directive, but at least the output is -// short enough the user has a chance to see that the compiler version is not sufficient for -// the funky template mojo we use. -#pragma GCC diagnostic error "-Wfatal-errors" -#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6. -#endif - -/* Check that the compiler at least claims to support C++11. It might not be sufficient - * because the compiler may not implement it correctly, but at least we'll know. - * On the other hand, visual studio still doesn't claim to support C++11 although it's - * compliant enugh for our purpose. - */ -#if (EIGEN_COMP_CXXVER < 11) -#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) -#pragma GCC diagnostic error "-Wfatal-errors" -#endif -#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) 
-#endif - -namespace Eigen { - -namespace internal { - -/* std::get is only constexpr in C++14, not yet in C++11 - */ - - -template<std::size_t I_, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I_]; } -template<std::size_t I_, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I_]; } -template<std::size_t I_, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I_]; } - -/* Suppose you have a template of the form - * template<typename T> struct X; - * And you want to specialize it in such a way: - * template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: }; - * template<> struct X<Foo<>> { ::: }; - * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since - * g++ can only match templates called with parameter packs if the number of template - * arguments is not a fixed size (so inside the first specialization, referencing - * X<Foo<Sn...>> will fail in g++). On the other hand, g++ will accept the following: - * template<typename S...> struct X<Foo<S...>> { ::: }: - * as an additional (!) specialization, which will then only match the empty case. - * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax, - * so we have to create a workaround for this. - */ -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) -#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n -#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) -#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n... -#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n... 
-#else -#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) -#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) -#define EIGEN_TPL_PP_SPEC_HACK_USE(n) -#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11WORKAROUNDS_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/src/EigenUnsupported/CXX11/src/util/EmulateArray.h b/src/EigenUnsupported/CXX11/src/util/EmulateArray.h deleted file mode 100644 index 834b20b..0000000 --- a/src/EigenUnsupported/CXX11/src/util/EmulateArray.h +++ /dev/null @@ -1,261 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_EMULATE_ARRAY_H -#define EIGEN_EMULATE_ARRAY_H - - - -// The array class is only available starting with cxx11. Emulate our own here -// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! -// Moreover, CUDA doesn't support the STL containers, so we use our own instead. 
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY) - -namespace Eigen { -template <typename T, size_t n> class array { - public: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& front() { return values[0]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& front() const { return values[0]; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& back() { return values[n-1]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - static std::size_t size() { return n; } - - T values[n]; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array() { } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v) { - EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { - EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { - EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, - const T& v4) { - EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - } - EIGEN_DEVICE_FUNC 
- EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5) { - EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6) { - EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6, const T& v7) { - EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - values[6] = v7; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array( - const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6, const T& v7, const T& v8) { - EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - values[6] = v7; - values[7] = v8; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(std::initializer_list<T> l) { - eigen_assert(l.size() == n); - internal::smart_copy(l.begin(), l.end(), values); - } -#endif -}; - - -// Specialize array for zero size -template <typename T> class array<T, 0> { - public: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& operator[] (size_t) { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& operator[] (size_t) const { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& front() { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - 
EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& front() const { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& back() { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& back() const { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array() : dummy() { } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() { - EIGEN_UNUSED_VARIABLE(l); - eigen_assert(l.size() == 0); - } -#endif - - private: - T dummy; -}; - -// Comparison operator -// Todo: implement !=, <, <=, >, and >= -template<class T, std::size_t N> -EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs) { - for (std::size_t i = 0; i < N; ++i) { - if (lhs[i] != rhs[i]) { - return false; - } - } - return true; -} - - -namespace internal { -template<std::size_t I_, class T, std::size_t N> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) { - return a[I_]; -} -template<std::size_t I_, class T, std::size_t N> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) { - return a[I_]; -} - -template<class T, std::size_t N> struct array_size<array<T,N> > { - enum { value = N }; -}; -template<class T, std::size_t N> struct array_size<array<T,N>& > { - enum { value = N }; -}; -template<class T, std::size_t N> struct array_size<const array<T,N> > { - enum { value = N }; -}; -template<class T, std::size_t N> struct array_size<const array<T,N>& > { - enum { value = N }; -}; - -} // end namespace internal -} // end namespace Eigen - -#else - -// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array -#include <array> -namespace Eigen { - -template <typename T, std::size_t 
N> using array = std::array<T, N>; - -namespace internal { -/* std::get is only constexpr in C++14, not yet in C++11 - * - libstdc++ from version 4.7 onwards has it nevertheless, - * so use that - * - libstdc++ older versions: use _M_instance directly - * - libc++ all versions so far: use __elems_ directly - * - all other libs: use std::get to be portable, but - * this may not be constexpr - */ -#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 -#define STD_GET_ARR_HACK a._M_instance[I_] -#elif defined(_LIBCPP_VERSION) -#define STD_GET_ARR_HACK a.__elems_[I_] -#else -#define STD_GET_ARR_HACK std::template get<I_, T, N>(a) -#endif - -template<std::size_t I_, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; } -template<std::size_t I_, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; } -template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; } - -#undef STD_GET_ARR_HACK - -} // end namespace internal -} // end namespace Eigen - -#endif - -#endif // EIGEN_EMULATE_ARRAY_H diff --git a/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h b/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h deleted file mode 100644 index 277ab14..0000000 --- a/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h +++ /dev/null @@ -1,158 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_FIXEDSIZEVECTOR_H -#define EIGEN_FIXEDSIZEVECTOR_H - -namespace Eigen { - -/** \class MaxSizeVector - * \ingroup Core - * - * \brief The MaxSizeVector class. 
- * - * The %MaxSizeVector provides a subset of std::vector functionality. - * - * The goal is to provide basic std::vector operations when using - * std::vector is not an option (e.g. on GPU or when compiling using - * FMA/AVX, as this can cause either compilation failures or illegal - * instruction failures). - * - * Beware: The constructors are not API compatible with these of - * std::vector. - */ -template <typename T> -class MaxSizeVector { - static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*)); - public: - // Construct a new MaxSizeVector, reserve n elements. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit MaxSizeVector(size_t n) - : reserve_(n), size_(0), - data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) { - } - - // Construct a new MaxSizeVector, reserve and resize to n. - // Copy the init value to all elements. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - MaxSizeVector(size_t n, const T& init) - : reserve_(n), size_(n), - data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) { - size_t i = 0; - EIGEN_TRY - { - for(; i < size_; ++i) { new (&data_[i]) T(init); } - } - EIGEN_CATCH(...) - { - // Construction failed, destruct in reverse order: - for(; (i+1) > 0; --i) { data_[i-1].~T(); } - internal::handmade_aligned_free(data_); - EIGEN_THROW; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ~MaxSizeVector() { - for (size_t i = size_; i > 0; --i) { - data_[i-1].~T(); - } - internal::handmade_aligned_free(data_); - } - - void resize(size_t n) { - eigen_assert(n <= reserve_); - for (; size_ < n; ++size_) { - new (&data_[size_]) T; - } - for (; size_ > n; --size_) { - data_[size_-1].~T(); - } - eigen_assert(size_ == n); - } - - // Append new elements (up to reserved size). 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void push_back(const T& t) { - eigen_assert(size_ < reserve_); - new (&data_[size_++]) T(t); - } - - // For C++03 compatibility this only takes one argument - template<class X> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void emplace_back(const X& x) { - eigen_assert(size_ < reserve_); - new (&data_[size_++]) T(x); - } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T& operator[] (size_t i) const { - eigen_assert(i < size_); - return data_[i]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T& operator[] (size_t i) { - eigen_assert(i < size_); - return data_[i]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T& back() { - eigen_assert(size_ > 0); - return data_[size_ - 1]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T& back() const { - eigen_assert(size_ > 0); - return data_[size_ - 1]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void pop_back() { - eigen_assert(size_ > 0); - data_[--size_].~T(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t size() const { return size_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - bool empty() const { return size_ == 0; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T* data() { return data_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T* data() const { return data_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T* begin() { return data_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T* end() { return data_ + size_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T* begin() const { return data_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T* end() const { return data_ + size_; } - - private: - size_t reserve_; - size_t size_; - T* data_; -}; - -} // namespace Eigen - -#endif // EIGEN_FIXEDSIZEVECTOR_H |